def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example 2
def extract_wiki(thresh, env_path, vec_file):
    program = os.path.basename(env_path[0])
    logger = logging.getLogger(program)
 
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
 
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    print('--- load ck12 word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(vec_file, binary=False)
    print('--- filtering keywords based on sim to ck12 keyword science')
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        topic = [w for w in text[:20] if w not in stopwords.words('english')]
        sim = np.mean([model[w].dot(model['science']) if w in model else 0 for w in topic])
        #sim = model['science'].dot(topic_vec)
        if sim > thresh:
            output.write(space.join(text) + "\n")
            i = i + 1
            if (i % 100 == 0):
                logger.info("Saved " + str(i) + " articles")    
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 3
def process_enwiki(input_file, output_file):
    space = ' '
    i = 0
    output = open(output_file, 'w')
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
Example 4
def parse_wiki(filename):
    fout = open('../../paper/data/wiki/wiki_corpus', 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write('%s\n' % ' '.join(text))
        if count % 10000 == 0:
            logging.info(count)
        count += 1

    fout.close()
    logging.info('Finish %d' % count)
def parse(filename):
	OUTPATH = '../gen_data/wikicorpus'
	fout = open(OUTPATH, 'w')
	wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
	count = 0
	for text in wiki.get_texts():
		fout.write(" ".join(text) + "\n")
		count = count + 1
		if (count % 10000 == 0):
			logging.info("Save "+str(count) + " articles")
	fout.close()
	logging.info("Finished saved "+str(count) + "articles")
Example 6
def process_wiki(infile, outfile):
	from gensim.corpora import WikiCorpus
	wiki = WikiCorpus(infile, lemmatize=False, dictionary={})
	i = 0
	with open(outfile, 'w') as fw:
		for text in wiki.get_texts():
			text = ' '.join(text)
			cut_text = cut(text)
			fw.write(re.sub(r' {1,}', ' ', ' '.join(cut_text)) + '\n')
			i += 1
			if i % 1000 == 0:
				logger.info('Saved ' + str(i) + ' texts')
	logger.info('Finished ' + str(i) + ' texts')
Example 7
def enwiki(srcPath, tarPath):
    index = 0
    space = " "    
    
    output = open(tarPath, 'w')
    wiki = WikiCorpus(srcPath, lemmatize=False, dictionary={})
    
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        index += 1
        if (index % 10000 == 0):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\tSaved " + str(index) + " articles.")
            
    output.close()
    print("Finished saved " + str(index) + " articles.")
Example 8
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    if not doc_set: # is empty
        return
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request to extract page_id and title
    
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]

        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title

            # get tokens tf in the text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1

            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
       
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)

                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0

    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
def main():

    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with io.open("wiki_texts.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)
Example 10
def dataprocess(_config):
    i = 0
    output = None
    if six.PY3:
        output = open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    else:
        output = codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    wiki = WikiCorpus(os.path.join(_config.data_path, _config.zhwiki_bz2), lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8', 'ignore') + '\n')
        else:
            output.write(' '.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished saving ' + str(i) + ' articles')
Example 11
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    i = 0

    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(b' '.join(text).decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')

    output.close()
    logger.info('Finished ' + str(i) + ' articles')
Example 12
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1

        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()
Example 13
def process_wiki(inp,outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 14
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # divided by character
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("process %d articles" % i)
    out.close()
def make_wiki_corpus(inp, outp, logger):
    '''
    Preprocess Wikipedia.
    :param inp: path to the dump file, e.g. enwiki-20150304-pages-articles.xml.bz2
    :param outp: output text file with the preprocessed corpus,
                 e.g. wiki.en.text
    :param logger: logger used to report progress during preprocessing
    '''
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})

    i = 0
    space = " "
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 16
def main():
    gensim.corpora.wikicorpus.tokenize = replacement_tokenize

    infn, outfn = sys.argv[1:3]
    wiki = WikiCorpus(infn, lemmatize=False, dictionary={})
    with open(outfn, 'w') as outfile:
        for i, article in enumerate(wiki.get_texts()):
            article = [entry.decode("utf-8") for entry in article]
            text = " ".join(article)
            mostly_sentences = nltk.sent_tokenize(text)

            sentences = []
            for sent in mostly_sentences:
                for line in sent.splitlines():
                    sentences.append(line.strip())

            for sentence in sentences:
                sentence = cleanup(sentence)
                if sentence:
                    print(sentence, file=outfile)
            if (i % 10000 == 0):
                print("Saved ", i, "articles")
Example 17
def preprocess():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/***.xml.bz2'
    f = open('./data/***.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False,
                      dictionary={})  # training corpus from the xml dump
    for text in wiki.get_texts():
        for temp_sentence in text:
            temp_sentence = Converter('zh-hans').convert(
                temp_sentence)  # convert traditional to simplified Chinese
            seg_list = list(jieba.cut(temp_sentence))
            for term in seg_list:
                l.append(term)
        f.write(space.join(l) + '\n')
        i = i + 1
        l = []

        if (i % 200) == 0:
            print("saved" + str(i) + "articles")

    f.close()
Example 18
def convert(input_path, output_path):
    logger.info("Converting Wiki Corpus...")
    corpus_path = check_path(input_path)
    wiki_text_output_path = output_path

    start_time = time.time()

    space = " "
    i = 0

    wiki = WikiCorpus(corpus_path, lemmatize=False, dictionary={})

    output = open(wiki_text_output_path, 'w')

    # Convert WikiCorpus into Text output (1 article per line)
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles. Time needed: " + str(time.time() - start_time))
Example 19
def preprocess():
    """
    使用gensim中的WikiCorpus库提取wiki的中文语料,并将繁体转成简体中文。
    然后利用jieba的分词工具将转换后的语料分词并写入一个txt
    每个wiki文档的分词结果写在新txt中的一行,词与词之间用空格隔开
    :return:
    """
    count = 0
    zhwiki_path = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduced_zhwiki.txt', 'w', encoding='utf8')
    wiki = WikiCorpus(zhwiki_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        word_list = []
        for sentence in text:
            sentence = Converter('zh-hans').convert(sentence)  # traditional to simplified Chinese
            seg_list = jieba.cut(sentence)
            for seg in seg_list:
                word_list.append(seg)
        f.write(' '.join(word_list) + '\n')
        count += 1
        if count % 200 == 0:
            print("Saved " + str(count) + ' articles')

    f.close()
Example 20
def main(args):
    """
    args: argparse.Namespace object

    Returns: None
    """
    logger = logging.getLogger(__name__)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # load wiki corpus from a .xml.bz2 file

    wiki = WikiCorpus(args.infile,
                      lemmatize=False,
                      processes=multiprocessing.cpu_count())

    # parse documents from the corpus and write to the output file
    cnt = 0
    with open(args.outfile, 'w', encoding='utf-8') as fout:
        for i, text in enumerate(wiki.get_texts()):
            fout.write(' '.join(text) + '\n')
            cnt = i + 1
            if cnt % 10000 == 0:
                logger.info('Processed %d documents' % cnt)
    logger.info('Finished processing %d documents' % cnt)
Example 21
def process_wiki():
    import logging
    import os.path
    import sys

    from gensim.corpora import WikiCorpus
    print(__name__)
    print("running %s" % ' '.join(sys.argv))

    # if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print((globals()['__doc__'] % locals()))
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = b' '
    i = 0

    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        list1 = space.join(text)
        output.write((list1.decode('utf-8')) + "\n")
        i += 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 22
 def set_wiki_to_txt(self, wiki_data_path=None):
     for s in sys.argv:
         print(s)
     if wiki_data_path is None:
         # parameter
         if len(sys.argv) != 2:
             print("Usage: python3 " + sys.argv[0] +
                   " wiki_data_path")
             exit()
         else:
             wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
     else:
         wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
     # convert wiki.xml to wiki.txt
     with open(r'.\word2vec_data\wiki_text.txt', 'w',
               encoding='utf-8') as output:
         text_count = 0
         for text in wiki_corpus.get_texts():
             # gensim yields each article as a list of token strings
             output.write(' '.join(text) + '\n')
             text_count += 1
             if text_count % 10000 == 0:
                 logging.info("Processed %d articles so far" % text_count)
     print("Conversion complete!")
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './study_ml/data/text_vector/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./study_ml/data/text_vector/reduce_zhiwiki.txt', 'w')
    # convert the xml-format Wikipedia dump into plain text
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert traditional Chinese characters to simplified
            temp_sentence = langconv.Converter('zh-hans').convert(
                temp_sentence)
            # word segmentation with jieba
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1

        if (i % 200 == 0):
            print('Saved ' + str(i) + ' articles')
    f.close()
Example 24
if __name__ == '__main__':

    # set up logging
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running: %s" % ' '.join(sys.argv))

    # check and process input arguments
    args = parse_args(sys.argv[1:])

    if 'input' not in args:
        logger.error("No input given!")
        sys.exit(1)

    # get args
    inp, outp, limit = args['input'], args['output'], args['limit']

    # prepare corpus
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = slice(wiki.get_texts(), limit)

    # save this for efficiency
    space = " "
    output = open(outp, 'w')
    iterate_with_logging(logger, 10000, texts,
                 lambda text: output.write(space.join(text) + "\n"))

    output.close()
Example 25
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Specify data path
data_path = '/data/khgkim/compling/dump'
token_path = '/data/khgkim/compling/word2vec_tokens.txt'
analogy_path = '/data/khgkim/compling/questions-words.txt'

os.chdir(data_path)

if not (os.path.isfile(token_path)):
    # Extract and tokenize Wikipedia articles
    wiki_corpus = WikiCorpus('wiki_dump.xml.bz2')
    wiki_lines = wiki_corpus.get_texts()

    # Write wiki_lines out for future use
    lines_output = open(token_path, 'w')
    for text in wiki_lines:
        lines_output.write(" ".join(text) + "\n")
    lines_output.close()
    # re-open the token file so LineSentence below reads from a file object
    wiki_lines = open(token_path)
else:
    print 'Output message: word2vec_tokens.txt already exists!'
    exit()

model = Word2Vec(sentences=LineSentence(wiki_lines),
                 size=400,
                 negative=5,
                 hs=0,
                 sample=1e-5,
Example 26
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec


wiki_corpus = WikiCorpus("dewiki-latest-pages-articles.xml.bz2", dictionary={None: None})

normal_window_model = Word2Vec(window=5)
normal_window_model.build_vocab(wiki_corpus.get_texts())
normal_window_model.train(wiki_corpus.get_texts(),
                          total_examples=normal_window_model.corpus_count,
                          epochs=normal_window_model.epochs)
normal_window_model.save("normal_window_model")

small_window_model = Word2Vec(window=2)
small_window_model.build_vocab(wiki_corpus.get_texts())
small_window_model.train(wiki_corpus.get_texts(),
                         total_examples=small_window_model.corpus_count,
                         epochs=small_window_model.epochs)
small_window_model.save("small_window_model")

Example 27
# -*- coding: utf-8 -*-
"""
    Source code of examples and exercises from the book
    "Curso de Programación Python"
    (C) Ediciones Anaya Multimedia 2019

    Authors: Arturo Montejo Ráez and Salud María Jiménez Zafra
"""
from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.utils import deaccent

# Read the downloaded Wikipedia dump
corpus = WikiCorpus('eswiki-latest-pages-articles.xml.bz2', dictionary=False)

# Remove accent marks
texts = [deaccent(t) for t in corpus.get_texts()]

# Define the algorithm to use and its hyperparameters
model = Word2Vec(size=400, window=5, min_count=5)

# Build the vocabulary
model.build_vocab(texts)

# Train the model
model.train(texts, chunksize=500)

# Save it to disk for later use
model.save('eswikipedia_w2v_model')
Example 28
#_*_coding:utf-8_*_

from gensim.corpora import WikiCorpus
import jieba
from langconv import *
import codecs
from tqdm import tqdm, trange
import time, datetime

start = datetime.datetime.now()
zhwiki = '/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
strs = []
i = 0
f = codecs.open('./zhiwiki.txt', 'a', 'utf-8')
wiki = WikiCorpus(zhwiki, lemmatize=False, dictionary={})
for text in tqdm(wiki.get_texts()):
    for sen in text:
        sen = Converter('zh-hans').convert(sen)
        sen_list = list(jieba.cut(sen))
        for s in sen_list:
            strs.append(str(s))
    tmp = ' '.join(strs)
    f.write(tmp + '\n')
    strs = []
    i = i + 1

    if (i % 200 == 0):
        print('Saved ' + str(i) + ' articles')
f.close()
end = datetime.datetime.now()
print((end - start).seconds)
Example 29
    reload(sys)
    sys.setdefaultencoding('utf-8')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) != 3:
        print("Using: python wiki_process.py zhwiki.xxx.xml.bz2 wiki.en.text")
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8') + '\n')
        else:
            output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 30
    program = os.path.basename(sys.argv[0])  # get the script file name
    # program = os.path.basename()  # get the script file name
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w',encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])  # gensim's WikiCorpus class for processing Wikipedia dumps
    for text in wiki.get_texts():  # get_texts yields each article as one line of text, with punctuation etc. stripped
        output.write(space.join(text) + "\n")
        i = i+1
        if (i % 10000 == 0):
            logger.info("Saved "+str(i)+" articles.")

    output.close()
    logger.info("Finished Saved "+str(i)+" articles.")
    



Example 31
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals())
        print(locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():  # fetch one article at a time
        output.write(space.join(text) + "\n")

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 32
# Filter out words that occur too rarely or too frequently
data.dictionary.filter_extremes(no_below=10, no_above=0.15)
words_count = len(data.dictionary)
print(words_count)

word2id = dict()
for elem in data.dictionary:
    word2id.update({data.dictionary[elem]: elem})

id2word = dict()
for elem in data.dictionary:
    id2word.update({elem: data.dictionary[elem]})

# In[4]:

sentences = list(data.get_texts())

# ### 2. LSA (Latent semantic analysis)
# This solution uses the full document as the context of a word. So, we have some vocabulary $W$ and a set of documents $D$. Matrix $X$ with shape $|W| \times |D|$ stores at position $w, d$ the importance of word $w$ for document $d$. If word $w$ does not occur in document $d$, then the corresponding position of $X$ is 0 (so the matrix is sparse).
#
# For any matrix you can compute the [SVD decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition)
# $$X = U \Sigma V^{T} \text{, where }$$
# * $U$ – orthogonal matrix $|W| \times |W|$ of left singular vectors
# * $\Sigma$ – diagonal matrix $|W| \times |D|$ of singular values
# * $V$ – orthogonal matrix $|D| \times |D|$ of right singular vectors
#
# Let's suppose that row $w$ of matrix $U\Sigma$ is a vector that represents word $w$, and row $d$ of $V$ corresponds to document $d$. In some sense we have already found embeddings of words and documents at the same time. But the size of these vectors is determined by the number of documents $|D|$.
#
# Nevertheless, you can use truncated SVD instead
# $$ X \approx X_k = U_k \Sigma_k V^{T}_k \text{, where }$$
# * $U_k$ – $k$ left singular vectors
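A minimal sketch of the truncated SVD described above, on a toy term-document matrix; the matrix values, k, and variable names are illustrative assumptions rather than part of the notebook.

import numpy as np

# toy |W| x |D| matrix: 5 words, 4 documents
X = np.array([
    [2., 0., 1., 0.],
    [0., 3., 0., 1.],
    [1., 0., 2., 0.],
    [0., 1., 0., 2.],
    [1., 1., 1., 1.],
])

k = 2  # number of singular triplets to keep
U, s, Vt = np.linalg.svd(X, full_matrices=False)
U_k, s_k, Vt_k = U[:, :k], s[:k], Vt[:k, :]

word_vectors = U_k * s_k   # row w is the k-dimensional embedding of word w
doc_vectors = Vt_k.T       # row d is the k-dimensional embedding of document d

# X_k = U_k Sigma_k V_k^T is the best rank-k approximation of X
X_k = word_vectors @ Vt_k
print(np.round(X_k, 2))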
Example 33
#encoding: utf-8
from gensim.corpora import WikiCorpus
import codecs
import os

path_for_save_resault = '/home/ubuntu/Documents/hw_background_gene/'

wiki_jpn = WikiCorpus(
    '/home/ubuntu/Documents/hw_background_gene/jawiki-latest-pages-articles.xml.bz2'
)

with codecs.open(os.path.join(path_for_save_resault, "wiki_jpn.txt"), "w",
                 'utf-8') as output:
    for i in wiki_jpn.get_texts():
        output.write('\n'.join(i).decode('utf-8') + '\n')
Example 34
 def generate_wiki_corpus(self):
     wiki_corpus = WikiCorpus(self.wikidump_filename, dictionary={})
     with open(self.output_text_filename,'w',encoding='utf-8') as output:
         for text in wiki_corpus.get_texts():
             output.write(' '.join(text) + '\n')
Example 35
corpus = WikiCorpus('../fawiki-latest-pages-articles.xml.bz2',dictionary=False)

max_sentence = -1

def generate_lines():
    for index, text in enumerate(corpus.get_texts()):
        if index < max_sentence or max_sentence==-1:
            yield text
        else:
            break

# Check if model is not exist
model = Word2Vec() 		
if ((os.path.exists('../model_farsi')) and (os.path.isfile('../model_farsi'))):
	model = Word2Vec.load('../model_farsi')
	result_1 = model.most_similar('روز')
	result_2 = model.most_similar(positive=['زن', 'پادشاه'], negative=['مرد'], topn=10)
	
	print "result is:"
	for (re,v) in result_1:
		print re + ' '+ str(v)
	print "======================="
	for (re,v) in result_2:
		print re + ' '+ str(v)


else:
	model.build_vocab(corpus.get_texts()) 
	model.train(generate_lines(),chunksize=500)
	model.save('../model_farsi')
"""

# pip install gensim

from gensim.corpora import WikiCorpus
import time

start_time = time.time()

# Creates an empty file to dump the data into.
 
target = open('Wiki_Data.txt', 'w')
wiki_data = WikiCorpus('enwiki-latest-pages-articles15.xml-p7744803p9244803.bz2')

i = 0
for text in wiki_data.get_texts():
    target.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
    i = i + 1
    if (i % 10000 == 0):
        print('Extracted ' + str(i) + ' articles')
target.close()

print(" Data Extraction Completed in %d seconds!" %(time.time() - start_time))

"""
Extracted 10000 articles
Extracted 20000 articles
Extracted 30000 articles
Extracted 40000 articles
Extracted 50000 articles
Extracted 60000 articles
Example 37
#!/usr/bin/python

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec

corpus = WikiCorpus('dewiki-latest-pages-articles.xml.bz2', dictionary=False, lemmatize=False)

model = Word2Vec(size=300, window=7, min_count=7, workers=4, negative=10, hs=0)
model.build_vocab(corpus.get_texts())
model.train(corpus.get_texts())
model.init_sims(replace=True)
model.save('dewiki.w2v')
Example 38
    Config = ConfigParser.ConfigParser()

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    if not os.path.isdir(os.path.dirname(outp)):
        raise SystemExit("Error: The output directory does not exist. Create"
                         "the directory and try again.")

    # create the dictionary containing document frequencies for each token
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=Dictionary())
    wiki.dictionary = Dictionary(wiki.get_texts(), prune_at=None)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

    # create the configuration file with default values
    config_file = open(outp + '_wikifinder.cfg', 'w')
    Config.add_section('general')
    Config.set('general', 'articlecount', wiki.length)
    Config.set('general', 'wordids_path', outp + '_wordids.txt.bz2')
    Config.set('general', 'bing_api_key', 'none')
    Config.add_section('citation-needed')
    Config.set('citation-needed', 'Citation needed', 'true')
    Config.set('citation-needed', 'Cn', 'true')
    Config.set('citation-needed', 'Fact', 'true')
    Config.set('citation-needed', 'Cb', 'true')
    Config.set('citation-needed', 'Ctn', 'true')
    Config.set('citation-needed', 'Ref?', 'true')
Example 39
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    wiki = WikiCorpus(inp, lemmatize=True)
    wiki.metadata = True  # Ensure doc id is captured

    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)

    # Save the document ids to titles as a dictionary -- this will take a long time
    # Also may be unnecessary if metadata works correctly
    docmap = {}
    for index, doc in enumerate(wiki.get_texts()):
        docmap[index] = doc[1][1]
    with bz2.BZ2File('doc_index.pickle.bz2', 'w') as f:
        pickle.dump(docmap, f)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm',
                       wiki,
                       progress_cnt=10000,
                       metadata=True)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
Example 40
def make_wiki(wiki_dump_path, wiki_text_path):
    wiki = WikiCorpus(wiki_dump_path)
    with open(wiki_text_path, 'w', encoding='utf-8') as fout:
        for text in tqdm(wiki.get_texts()):
            fout.write(' '.join(text) + '\n')
Example 41
import os.path
import sys

from gensim.corpora import WikiCorpus
from gensim.models import TfidfModel, Word2Vec

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])

    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    inp, outp = sys.argv[1:3]

    wiki = WikiCorpus(inp, dictionary={})
    model = Word2Vec(size=300, window=5, min_count=5, workers=8)
    sentences = wiki.get_texts()
    model.build_vocab(sentences)
    sentences = wiki.get_texts()
    model.train(sentences)
    model.save(outp)
    model.init_sims(replace=True)
    model.save('trimmed-model')
logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--articles", help="path to enwiki-latest-pages-articles.xml.bz2")
parser.add_argument("-m", "--model", help="path to model dir")
parser.add_argument("-d", "--demo", help="path to question-words.txt analogies")
parser.add_argument("-l", "--lines", help="path to wiki-lines.txt")
args = parser.parse_args()

# Load or create wiki-lines.txt
if not (os.path.isfile(args.lines)):
    wiki_corpus = WikiCorpus(args.articles, lemmatize=False)
    wiki_lines = wiki_corpus.get_texts()

    # Write wiki_lines out for future use
    lines_file = open(args.lines, 'w')
    for text in wiki_lines:
        lines_file.write(" ".join(text) + "\n")
    lines_file.close()
    # re-open the freshly written file so LineSentence gets a file object
    wiki_lines = open(args.lines)
else:
    wiki_lines = open(args.lines)

model = Word2Vec(
        sentences=LineSentence(wiki_lines),
        size=400,
        hs=1,
        window=5,
        min_count=5,
Example 43
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import sys
import urllib.request

# download the Wikipedia dump
urllib.request.urlretrieve("https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2", "enwiki-latest-pages-articles.xml.bz2")


# train

with open('wiki.en.text', 'w') as fout:
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
    for i, text in enumerate(wiki.get_texts()):
        fout.write(' '.join(text) + '\n')
        if i == 99999:
            sys.exit()


model = Word2Vec(LineSentence('wiki.en.text'), size=200, window=5, min_count=3, workers=8)
# trim unneeded model memory = use (much) less RAM
model.init_sims(replace=True)
model.save('wiki.en.word2vec.model')


# test the model
model.most_similar('queen', topn=3)


model.most_similar(positive=['woman', 'king'],
Example 44
if __name__ == "__main__":  # if the program is being run directly and is not being imported...
	program = os.path.basename(sys.argv[0])
	logger = logging.getLogger(program)

	logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
	logging.root.setLevel(level=logging.INFO)
	logger.info("running %s" % ' '.join(sys.argv))

	if len(sys.argv) != 3:
		print("Use python3 ExtractArticles.py enwiki.xxx.xml.bz2 wikien.txt")
		sys.exit(1)  # exits from python

	inp, outp = sys.argv[1:3]
	space = " "
	i = 0

	output = open(outp, "w", encoding="utf-8")

	wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
	count = 0
	for text in wiki.get_texts():  # wiki.get_texts() is generator object
		output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
		i = i + 1
		if (i % 10000 == 0):
			logger.info("Saved", str(i), "articles")

	output.close()
	logger.info("Finished saving", str(i), "articles")

Example 45
import logging
from gensim.corpora import WikiCorpus
logging.info("Love Live!")
wiki_corpus = WikiCorpus('zhwiki-20190520-pages-articles-multistream.xml.bz2',
                         dictionary={})
texts_num = 0
with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
    for text in wiki_corpus.get_texts():
        output.write(''.join(text) + '\n')
        texts_num += 1
        if texts_num % 10000 == 0:
            logging.info("Processed %d articles" % texts_num)
Example 46
 def set_wiki_to_txt(self):
     wiki_corpus = WikiCorpus(self.wiki_data_path, dictionary={})
     with open(self.save_text_path, 'w', encoding='utf-8') as output:
         for text in wiki_corpus.get_texts():
             output.write(' '.join(text) + '\n')
         print("转档完成!")
Example 47
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # get the script file name
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])  # gensim's WikiCorpus class for processing Wikipedia dumps
    for text in wiki.get_texts():  # get_texts yields each article as one line of text, with punctuation etc. stripped
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles.")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles.")
Example 48
from gensim.corpora import WikiCorpus
import time


# Extract edu_dict
edu_dict = set()
with open('datas/dict/edu_dict.txt', 'r') as f:
    edu_dict.update([line.strip('\n') for line in f])


# Extract articles + convert to traditional Chinese

# !! Warning !!
# The code below will overwrite the existing result and takes roughly 20 minutes to run
wiki_corpus = WikiCorpus('datas/raw/zhwiki-20170801-pages-articles.xml.bz2', dictionary=edu_dict)
with open('datas/wiki-texts.txt', 'w', encoding='utf-8') as output:
    start_time = time.time()
    for i, text in enumerate(wiki_corpus.get_texts()):
        output.write(' '.join(text) + '\n')
        if i % 1000 == 0:
            print('Finished %3dk lines / elapsed time %10.2f' % (i/1000, time.time() - start_time), end='\r')
Example 49
This code converts the xml-format Wikipedia training corpus downloaded from the web into txt format.
Wikipedia training corpus:
    Link: https://pan.baidu.com/s/1eLkybiYOE_aVxsN0pALATg
    Password: hmtn
"""

from gensim.corpora import WikiCorpus

if __name__ == '__main__':

    print('Main program starting...')

    input_file_name = 'zhwiki-latest-pages-articles.xml.bz2'
    output_file_name = 'wiki.cn.txt'
    print('Reading wiki data...')
    input_file = WikiCorpus(input_file_name, lemmatize=False, dictionary={})
    print('Finished reading wiki data!')
    output_file = open(output_file_name, 'w', encoding="utf-8")

    print('Processing started...')
    count = 0
    for text in input_file.get_texts():
        output_file.write(' '.join(text) + '\n')
        count = count + 1
        if count % 10000 == 0:
            print('Processed %d records so far' % count)
    print('Processing finished!')

    output_file.close()
    print('Main program finished!')
     # wiki.get_texts() will only return articles which pass a couple 
     # filters that weed out stubs, redirects, etc. If you included all of
     # those, Wikipedia is more like ~17M articles.
     #
     # For each article, it's going to add the words in the article to the 
     # dictionary.
     # 
     # If you look inside add_documents, you'll see that it calls doc2bow--
     # this generates a bag of words vector, but we're not keeping it. The
     # dictionary isn't finalized until all of the articles have been
     # scanned, so we don't know the right mapping of words to ids yet.
     #
     # You can use the prune_at parameter to prevent the dictionary from
     # growing too large during this process, but I think it's interesting
     # to see the total count of unique tokens before pruning.
     dictionary.add_documents(wiki.get_texts(), prune_at=None)            
                     
     print '    Building dictionary took %s' % formatTime(time.time() - t0)
     print '    %d unique tokens before pruning.' % len(dictionary)
     sys.stdout.flush()
     
     keep_words = 100000    
 
     # The initial dictionary is huge (~8.75M words in my Wikipedia dump), 
     # so let's filter it down. We want to keep the words that are neither 
     # very rare or overly common. To do this, we will keep only words that 
     # exist within at least 20 articles, but not more than 10% of all 
     # documents. Finally, we'll also put a hard limit on the dictionary 
     # size and just keep the 'keep_words' most frequent words.
     wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
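A tiny, self-contained sketch of the add_documents / filter_extremes flow described in the comments above, run on a handful of toy documents; the documents and the thresholds no_below=2, no_above=0.75, keep_n=10 are illustrative stand-ins for the Wikipedia-scale values.

from gensim.corpora import Dictionary

toy_docs = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
]

dictionary = Dictionary()
# add_documents calls doc2bow internally; token ids are not final until all docs are seen
dictionary.add_documents(toy_docs, prune_at=None)
print('%d unique tokens before pruning' % len(dictionary))

# keep tokens appearing in at least 2 documents but in no more than 75% of them,
# capped at the 10 most frequent survivors (analogous to no_below=20 / no_above=0.1 /
# keep_n=keep_words on the full Wikipedia dictionary)
dictionary.filter_extremes(no_below=2, no_above=0.75, keep_n=10)
print('%d tokens kept after pruning' % len(dictionary))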
     
Example 51
def xml2txt(f_name, out_name):
    output = open(out_name, 'w', encoding='utf-8')
    wiki = WikiCorpus(f_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        str_line = ' '.join(text)
        output.write(str_line + '\n')
parser = ArgumentParser(description="Get a number of articles containing analogy words from a wikipedia dump.",
						formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("csvPath", help="The path to the csv containing the word analogies which should be contained")
parser.add_argument("wikiPath", help="The path of the wikipedia dump")
parser.add_argument("outputPath", help="The output path")
parser.add_argument("--n-articles", help="The number of articles", type=int, default=1000)
args = parser.parse_args()


if args.wikiPath.endswith(".txt"):
	inp = open(args.wikiPath, "r")
	wiki_file = False
else:
	wiki = WikiCorpus(args.wikiPath, lemmatize=False, dictionary={})
	inp = wiki.get_texts()
	wiki_file = True

try:
	with open(args.csvPath) as csvfile:
		word_analogies = [row for row in csv.reader(csvfile, delimiter=",")]
		remaining_n_articles = args.n_articles
		with open(args.outputPath, "w") as out:
			for text in inp:
				for word_analogy in word_analogies:
					if word_analogy[0] in text and word_analogy[1] in text:
						if wiki_file:
							out.write(" ".join(text) + "\n")
						else:
							out.write(text)
						remaining_n_articles -= 1
Example 53
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
 
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
 
    # check and process input arguments
    if len(sys.argv) < 3:
        print "Usage: extractwiki.py infile_name outfile_name"
        sys.exit(1)
        
    infilename, outfilename = sys.argv[1:3]
 
    if os.path.isfile(outfilename):
        logger.error("Output file %s exists. Change the file name and try again." %outfilename)
        sys.exit(1)
        
    i = 0
    output = open(outfilename, 'w')
    wiki = WikiCorpus(infilename, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write( " ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
 
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
    
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    # if len(sys.argv) < 3:
    #     print(globals()['__doc__'] % locals())
    #     sys.exit(1)
    # python process_wiki.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
    # inp, outp = sys.argv[1:3]
    # set the input and output filenames
    inp, outp = '/home/hs/Data/wikipedia/zhwiki-latest-pages-articles.xml.bz2', '/home/hs/Data/wikipedia/wiki.zh.text'

    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = wiki.get_texts()
    for text in texts:
        # print((text[0]).decode("utf-8"))
        # exit()
        output.write(space.join([t.decode('utf-8') for t in text]) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")