Example #1
import logging
import os
import re

import jieba
import wget
from gensim.corpora.wikicorpus import WikiCorpus
from opencc import OpenCC
from tqdm import tqdm


def preprocess_wiki(input_file, output_file):
    # Download the Wiki dump first if the input file does not exist
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert traditional Chinese to simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if (i % 10000 == 0):
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
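
A minimal usage sketch for the function above, with illustrative file names and the older (pre-4.0) gensim Word2Vec API (size rather than vector_size); the space-separated output file can be streamed back in with LineSentence:

from gensim.models.word2vec import LineSentence, Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# 'zhwiki_seg.txt' is only an example output name.
preprocess_wiki('zhwiki-latest-pages-articles.xml.bz2', 'zhwiki_seg.txt')
model = Word2Vec(LineSentence('zhwiki_seg.txt'), size=200, window=5, min_count=5)
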
Example #2
 def test_min_token_len_not_set(self):
     """
     don't set the parameter token_min_len and check that 'a' does not exist as a token
     default token_min_len=2
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     self.assertTrue(u'a' not in next(wc.get_texts()))
Example #3
 def test_max_token_len_set(self):
     """
     set the parameter token_max_len to 16 and check that 'collectivization' exists as a token
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'collectivization' in next(l))
Example #4
 def test_min_token_len_set(self):
     """
     set the parameter token_min_len to 1 and check that 'a' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'a' in next(l))
Example #5
 def test_min_token_len_not_set(self):
     """
     don't set the parameter token_min_len and check that 'a' does not exist as a token
     default token_min_len=2
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     self.assertTrue(u'a' not in next(wc.get_texts()))
Example #6
class WikiSentences:
    def __init__(self, wiki_dump_path):
        self.wiki = WikiCorpus(wiki_dump_path)

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield sentence
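
Unlike a bare generator, the WikiSentences object above can be iterated over repeatedly, which is what Word2Vec needs for its multiple passes. A short sketch with a placeholder dump path and pre-4.0 gensim parameter names:

from gensim.models.word2vec import Word2Vec

sentences = WikiSentences('enwiki-latest-pages-articles.xml.bz2')  # placeholder path
model = Word2Vec(sentences, size=200, window=5, min_count=10, workers=4)
model.save('wiki_word2vec.model')  # example output path
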
Example #7
 def test_max_token_len_not_set(self):
     """
     don't set the parameter token_max_len and check that 'collectivization' does not exist as a token
     default token_max_len=15
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'collectivization' not in next(l))
Example #8
 def test_max_token_len_not_set(self):
     """
     don't set the parameter token_max_len and check that 'collectivization' does not exist as a token
     default token_max_len=15
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'collectivization' not in next(l))
Example #9
 def test_max_token_len_set(self):
     """
     set the parameter token_max_len to 16 and check that 'collectivization' exists as a token
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     token_max_len=16,
                     lemmatize=False)
     self.assertTrue(u'collectivization' in next(wc.get_texts()))
Example #10
 def test_lower_case_set_false(self):
     """
     set the parameter lower to False and check that the upper-case token 'Anarchism' exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #11
 def test_get_texts_returns_generator_of_lists(self):
     if sys.version_info < (2, 7, 0):
         return
     wc = WikiCorpus(datapath(FILENAME))
     l = wc.get_texts()
     self.assertEqual(type(l), types.GeneratorType)
     first = next(l)
     self.assertEqual(type(first), list)
     self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))
Example #12
 def test_lower_case_set_false(self):
     """
     set the parameter lower to False and check that the upper-case token 'Anarchism' exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #13
 def test_lower_case_set_true(self):
     """
     set the parameter lower to True and check that the upper-case token 'Anarchism' does not exist
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' not in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #14
 def test_min_token_len_set(self):
     """
     set the parameter token_min_len to 1 and check that 'a' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     token_min_len=1,
                     lemmatize=False)
     self.assertTrue(u'a' in next(wc.get_texts()))
Example #15
    def test_unicode_element(self):
        """
        First unicode article in this sample is
        1) папа
        """
        wc = WikiCorpus(datapath(FILENAME_U), processes=1)

        l = wc.get_texts()
        self.assertTrue(u'папа' in next(l))
Example #16
class WikiCorpus:
    def __init__(self, wiki_dump_path, lang):
        # This wrapper class shadows gensim's WikiCorpus, so import the gensim
        # class under an alias to avoid calling the wrapper itself recursively.
        from gensim.corpora.wikicorpus import WikiCorpus as GensimWikiCorpus
        logging.info('Parsing wiki corpus')
        self.wiki = GensimWikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield list(sentence)
Example #17
 def test_get_texts_returns_generator_of_lists(self):
     if sys.version_info < (2, 7, 0):
         return
     wc = WikiCorpus(datapath(FILENAME))
     l = wc.get_texts()
     self.assertEqual(type(l), types.GeneratorType)
     first = next(l)
     self.assertEqual(type(first), list)
     self.assertTrue(
         isinstance(first[0], bytes) or isinstance(first[0], str))
Example #18
    def test_first_element(self):
        """
        First two articles in this sample are
        1) anarchism
        2) autism
        """
        wc = WikiCorpus(datapath(FILENAME), processes=1)

        l = wc.get_texts()
        self.assertTrue(u'anarchism' in next(l))
        self.assertTrue(u'autism' in next(l))
Example #19
 def test_custom_tokenizer(self):
     """
     define a custom tokenizer function and use it
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                     token_max_len=16, token_min_len=1, lower=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'collectivization' in list_tokens)
     self.assertTrue(u'a' in list_tokens)
     self.assertTrue(u'i.e.' in list_tokens)
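
test_custom_tokenizer above passes a custom_tokeiner helper that is defined elsewhere in the test module (the same helper appears again in the next example). gensim documents the tokenizer_func interface as a callable taking (text, token_min_len, token_max_len, lower) and returning a list of strings. A hypothetical implementation consistent with the assertions above (it keeps short tokens such as 'a' and punctuation-bearing tokens such as 'i.e.') could look like:

def custom_tokeiner(text, token_min_len, token_max_len, lower):
    # Split on whitespace only, so tokens like 'i.e.' survive intact,
    # then keep tokens whose length falls within the configured bounds.
    if lower:
        text = text.lower()
    return [token for token in text.split()
            if token_min_len <= len(token) <= token_max_len]
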
Example #20
 def test_custom_tokenizer(self):
     """
     define a custom tokenizer function and use it
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                     token_max_len=16, token_min_len=1, lower=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'collectivization' in list_tokens)
     self.assertTrue(u'a' in list_tokens)
     self.assertTrue(u'i.e.' in list_tokens)
Example #21
 def test_lower_case_set_true(self):
     """
     set the parameter lower to True and check that the upper-case token 'Anarchism' does not exist
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     lower=True,
                     lemmatize=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' not in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #22
    def test_first_element(self):
        """
        First two articles in this sample are
        1) anarchism
        2) autism
        """
        if sys.version_info < (2, 7, 0):
            return
        wc = WikiCorpus(datapath(FILENAME))

        l = wc.get_texts()
        self.assertTrue(b"anarchism" in next(l))
        self.assertTrue(b"autism" in next(l))
Example #23
class WikiSentences:
    # reference: https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py
    def __init__(self, wiki_dump_path, lang):
        logging.info('Parsing wiki corpus')
        self.wiki = WikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            if self.lang == 'zh':
                yield list(jieba.cut(''.join(sentence), cut_all=False))
            else:
                yield list(sentence)
Example #24
    def test_first_element(self):
        """
        First two articles in this sample are
        1) anarchism
        2) autism
        """
        if sys.version_info < (2, 7, 0):
            return
        wc = WikiCorpus(datapath(FILENAME))

        l = wc.get_texts()
        self.assertTrue(b"anarchism" in next(l))
        self.assertTrue(b"autism" in next(l))
Example #25
def train_and_save_model(articles_path, model_path):
    corpus = WikiCorpus(articles_path, lemmatize=False, dictionary={})
    sentences = list(corpus.get_texts())
    params = {
        'size': 200,
        'window': 10,
        'min_count': 10,
        'workers': max(1,
                       multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }
    model = Word2Vec(sentences, **params)
    model.save(model_path)

    return model
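
Once saved, the model from train_and_save_model can be reloaded and queried; a brief sketch (the path and query word are placeholders):

from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('wiki_word2vec.model')  # placeholder path
print(model.wv.most_similar('computer', topn=10))
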
Example #26
 def makeWikiTextEmbedding(self):
     # on wiki text sentences
     wiki = WikiCorpus('data/swwiki-latest-pages-articles.xml.bz2',
                       lemmatize=False,
                       dictionary={})
     sentences = list(wiki.get_texts())
     print("wikitext: ", len(sentences), " sentences")
     self.debugPrintRandomSentences(sentences, 10)
     model = gs.models.Word2Vec(sentences,
                                size=100,
                                window=5,
                                min_count=5,
                                workers=8,
                                sg=1,
                                hs=1,
                                iter=15)
Example #27
def main():
    args = setup_args()
    logging.info(args)

    fw = open(args.text, 'w')
    corpus = WikiCorpus(args.dump,
                        dictionary={'a'},
                        tokenizer_func=tokenize_spacy)
    for index, sentences in enumerate(corpus.get_texts()):
        for sentence in sentences:
            fw.write('{}\n'.format(sentence))

        if index % 10000 == 0:
            logging.info('Done Article: {}'.format(index))

    return
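
The tokenize_spacy callable passed to WikiCorpus above is not shown in this excerpt. Whatever it does, it has to follow gensim's tokenizer_func interface: accept (text, token_min_len, token_max_len, lower) and return a list of strings. A rough, hypothetical sketch built on spaCy's blank English tokenizer:

import spacy

# A tokenizer-only pipeline; loading it once at module level avoids
# re-initialising spaCy for every article.
_nlp = spacy.blank('en')


def tokenize_spacy(text, token_min_len, token_max_len, lower):
    tokens = [token.text for token in _nlp(text)]
    if lower:
        tokens = [token.lower() for token in tokens]
    return [token for token in tokens
            if token_min_len <= len(token) <= token_max_len]
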
Example #28
def test_get_interesting_proportion():
    dictionary = HashDictionary(id_range=100000)
    dataset = Dataset(settings.dataset)
    terms = dataset.all_lemmas()
    
    wiki = RestrictedWikiCorpus(settings.corpus, terms=terms,
                                dictionary=dictionary)
    
    num_pages = sum(1 for _ in wiki.get_texts())
    
    unrestricted = WikiCorpus(settings.corpus,
                              dictionary=dictionary)

    num_unrestricted_pages = sum(1 for _ in unrestricted.get_texts())
    print "Unrestricted: %d, restricted: %d" % (
        num_unrestricted_pages, num_pages)
Example #29
def Word2VecTraining(Size=200, window=7, min_count=5, Language='spanish'):

    # Load the Spanish Wikipedia dump with WikiCorpus
    wiki = WikiCorpus(
        'D:/Gita/GITA_Master/Databases/WikiCorpus/eswiki-latest-pages-articles.xml.bz2',
        lemmatize=False,
        dictionary={})
    corpus = list(wiki.get_texts())
    # Define training parameters
    params = {
        'size': Size,
        'window': window,
        'min_count': min_count,
        'workers': max(1,
                       multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }
    #Model Training with WikiCorpus
    word2vec = Word2Vec(corpus, **params)

    ################TODO: Save model####################

    return word2vec
Example #30
logger = logging.getLogger("logger")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
wiki = WikiCorpus(sys.argv[1], lemmatize=False, dictionary={})

limit = None
if len(sys.argv) > 2:
    limit = int(sys.argv[2])

i = 0
sentences_out_stream = open("sentences.txt", "w")

logger.info("building sentences list")
for text in wiki.get_texts():
    # write each article as chunks of at most 10000 tokens per line
    while len(text) > 0:
        chunk = text[0:10000]
        text = text[10000:]
        sentences_out_stream.write(" ".join(chunk) + "\n")
    i = i + 1
    logger.info("%i articles processed", i)
    if limit is not None and i > limit:
        break

sentences_out_stream.close()
logger.info("building vectors from sentences")
sentences_in_stream = open("sentences.txt", "rb")
vectors = Word2Vec(LineSentence(sentences_in_stream),
                   size=300,
Example #31
from gensim.corpora.wikicorpus import WikiCorpus

wiki = WikiCorpus('corpus/arwiki-latest-pages-articles.xml.bz2')

from gensim import utils

corpus = [{
    'id': 'doc_%i' % num,
    'tokens': text
} for num, text in enumerate(wiki.get_texts())]

from PyArabic import ArabicPreprocessor

preprocessor = ArabicPreprocessor()


class LabeledQID(object):
    def __init__(self, filename, qid):
        self.filename = filename
        self.qid = qid

    def __iter__(self):

        # Loading test set
        tree = etree.parse(self.filename)

        # {QID, Qtext} dictionary for questions
        questions = {}

        sentence = tree.xpath('Question[@QID=' + self.qid + ']/Qtext')[0].text
        uid = 0
Example #32
from gensim.corpora.wikicorpus import WikiCorpus


wiki = WikiCorpus('', processes=None, lemmatize=False, dictionary=None)
texts = wiki.get_texts()
with open('wikitext.txt', 'w') as wikitext:
    for text in texts:
        wikitext.write(' '.join(text) + "\n")
    
Example #33
from gensim.corpora.wikicorpus import WikiCorpus
import time

if __name__ == "__main__":

    print("Importing Wikipedia")
    wiki = WikiCorpus('en/enwiki-latest-pages-articles.xml.bz2',
                      lemmatize=False,
                      dictionary={})

    print("Getting sentences")
    start = time.time()

    with open('sentences.txt', 'w') as f:
        for i, text in enumerate(wiki.get_texts(), start=1):
            # each text is a list of tokens; write them as one space-separated line
            f.write(" ".join(text) + "\n")
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')

    end = time.time()
    print(end - start, ' seconds elapsed')
Example #34
def get_text():
    wiki = WikiCorpus('ruwiki-20181020-pages-articles-multistream.xml.bz2')
    for text in wiki.get_texts():
        yield [word for word in text]
Example #35
from gensim.corpora.wikicorpus import WikiCorpus

wiki = WikiCorpus('', processes=None, lemmatize=False, dictionary=None)
texts = wiki.get_texts()
with open('wikitext.txt', 'w') as wikitext:
    for text in texts:
        wikitext.write(' '.join(text) + "\n")
Example #36
                    level=logging.INFO)

import datetime

from gensim.corpora.wikicorpus import WikiCorpus

print('Training: Started at {}'.format(datetime.datetime.now().time()))

wiki = WikiCorpus('K:/ruwiki-20191001-pages-articles-multistream.xml.bz2',
                  dictionary=False)

print('Training: Corpus loaded at {}'.format(datetime.datetime.now().time()))

from gensim.models.phrases import Phrases, Phraser

bigram = Phrases(wiki.get_texts(), min_count=30, progress_per=10000)
bigram_transformer = Phraser(bigram)

print('Training: Bigrams processed at {}'.format(
    datetime.datetime.now().time()))


def text_generator_bigram():
    for text in wiki.get_texts():
        yield bigram_transformer[[word.decode('utf-8') for word in text]]


trigram = Phrases(text_generator_bigram(), min_count=30, progress_per=10000)
trigram_transformer = Phraser(trigram)

print('Training: Trigrams processed at {}'.format(
Example #37
import sys
import json
from os import path
from gensim.corpora.wikicorpus import WikiCorpus

base_dir = path.join(path.dirname(path.realpath(__file__)), path.pardir)
wiki_filename = 'simplewiki-20171103-pages-articles-multistream.xml.bz2'
wiki_path = path.join(base_dir, 'corpora', wiki_filename)
outname = path.join(base_dir, 'corpora', 'simplewikiselect')

index = []  # Save information about articles as they've been processed.

wiki = WikiCorpus(wiki_path, dictionary=True)  # dict=True avoids making vocab
wiki.metadata = True  # Want article titles
print("Loading Wikipedia archive (this may take a few minutes)... ", end="")
articles = list(wiki.get_texts())
print("Done.")

num_articles = len(articles)

print("Total Number of Articles:", num_articles)

MAX_WC = 20_000_000
ARTICLE_MIN_WC = 200
ARTICLE_MAX_WC = 10000

ac = 0
wc = 0
selected = []

with open(outname + ".txt", "w") as f:
Example #38
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
import multiprocessing
import logging

# enable logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
                    datefmt='%H:%M:%S',
                    level=logging.INFO)

# load Wiki dump file
wiki = WikiCorpus('kawiki-latest-pages-articles.xml.bz2',
                  lemmatize=False,
                  dictionary={})
sentences = list(wiki.get_texts())

# define training parameters
params = {
    'size': 200,
    'window': 10,
    'min_count': 10,
    'workers': max(1,
                   multiprocessing.cpu_count() - 1),
    'sample': 1E-3,
    'iter': 5,
    'sg': 1,
    'hs': 1
}

# train and save word2vec model
word2vec = Word2Vec(sentences, **params)
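
The closing comment mentions saving the model, but the snippet stops after training; a one-line completion with an example output path:

word2vec.save('kawiki-word2vec.model')  # example output path
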
Example #39
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Word2Vec

wiki = WikiCorpus('/Users/pavel/PycharmProjects/NeuralNetworkWithTenser/src/wordrecognition/res/ruwiki-20190120-pages-articles-multistream1.xml-p4p204179.bz2', dictionary=False)
print('bigram')
bigram = Phrases(wiki.get_texts())
print('bigram_transformer')
bigram_transformer = Phraser(bigram)


def text_generator_bigram():
    for text in wiki.get_texts():
        yield bigram_transformer[[word for word in text]]


trigram = Phrases(text_generator_bigram())
print('trigram')
trigram_transformer = Phraser(trigram)
print('trigram_transformer')


def text_generator_trigram():
    for text in wiki.get_texts():
        yield trigram_transformer[bigram_transformer[[word for word in text]]]


print('model create')
model = Word2Vec(size=100, window=7, min_count=10, workers=10, iter=1, min_alpha=0.025)
print('build_vocab')
model.build_vocab(text_generator_trigram())
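
The snippet above builds the vocabulary but never calls train(). Because text_generator_trigram() returns a fresh single-pass generator each time it is called, one training pass (matching iter=1 above) can be run as follows; the save path is only an example:

print('train')
model.train(text_generator_trigram(),
            total_examples=model.corpus_count,
            epochs=1)
model.save('ruwiki-trigram-word2vec.model')  # example output path
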
Example #40
class WikiAllData():
    def __init__(self,
                 corpus,
                 wiki_dict,
                 wordfile,
                 vocab_size=200000,
                 window_size=5):
        self.w2id_dict = util.load_worddict(wordfile, vocab_size)
        self.window_size = window_size

        print('Starting loading Wiki Corpus...', end='')
        wiki_d = Dictionary.load(wiki_dict)
        self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
        print('[done]')

    def batch_generator(self, batch_size=128):
        QUEUE_END = '__QUEUE_END105834569xx'

        def load(q, batch_size):
            text_gen = self.wiki_corpus.get_texts()
            input_batch = np.zeros(batch_size, dtype=np.int32)
            label_batch = np.zeros((batch_size, 1), dtype=np.int32)

            counter = 0
            #w_vec : list of words in a document
            for w_vec in text_gen:
                id_vec = [
                    self.w2id_dict[w] if w in self.w2id_dict else -1
                    for w in w_vec
                ]
                init_mid = window = random.randint(2, self.window_size)

                # sliding window center go through the w_vec
                for mid_idx in range(init_mid, len(id_vec) - window - 1):
                    start_idx = max(0, mid_idx - window)
                    end_idx = min(mid_idx + window, len(id_vec) - 1)
                    # go through window
                    for target_idx in range(start_idx, end_idx + 1):
                        if target_idx == mid_idx:
                            continue
                        if id_vec[target_idx] == -1:
                            continue

                        input_batch[counter] = id_vec[mid_idx]
                        label_batch[counter] = id_vec[target_idx]

                        if counter == batch_size - 1:
                            q.put((input_batch, label_batch))
                            input_batch = np.zeros(batch_size, dtype=np.int32)
                            label_batch = np.zeros((batch_size, 1),
                                                   dtype=np.int32)
                            counter = 0
                        else:
                            counter += 1

                    window = random.randint(2, self.window_size)

            q.put(QUEUE_END)

        q = queue.Queue(maxsize=500)
        t = threading.Thread(target=load, args=(q, batch_size))
        t.daemon = True
        t.start()

        while True:
            q_output = q.get()
            if q_output == QUEUE_END:
                break
            yield q_output
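
A sketch of how the batch generator above might be consumed, for example to feed a skip-gram training loop; the constructor arguments are placeholders for whatever corpus, dictionary and vocabulary files the original project uses:

data = WikiAllData(corpus='enwiki-latest-pages-articles.xml.bz2',  # placeholder
                   wiki_dict='wiki_dictionary.dict',               # placeholder
                   wordfile='vocab.txt')                           # placeholder
for step, (inputs, labels) in enumerate(data.batch_generator(batch_size=128)):
    # inputs: (128,) int32 center-word ids; labels: (128, 1) int32 context ids
    print(step, inputs.shape, labels.shape)
    if step >= 2:
        break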