Example #1
def update_model(subreddit, text_array):
    debug = False
    #Path to the dictionary
    path_to_dict = r'../models/wikidump_wordids/wikidump_wordids.txt'

    #Path to the folder full of models pertaining to certain subreddits
    path_to_models_folder = '../models/' + subreddit + '/'

    #Path to where we're gonna save the model
    path_to_model = path_to_models_folder + 'model.model'
    path_to_load = path_to_models_folder + 'model.model'
    #Load model
    model = gensim.models.LdaModel.load(path_to_load)

    #Loading in dictionary
    dct = Dictionary()
    dct.add_documents([text_array])
    id2word = dct
    #Cleaning up the text
    common_corpus = [clean_text(text) for text in text_array]

    #Debug Print
    if debug:
        print(common_corpus)

    #Training the new model
    model.update(common_corpus)

    #Save the new model
    model.save(path_to_model)
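Note that LdaModel.update expects bag-of-words vectors built with the dictionary the model was trained with, whereas the snippet above passes the cleaned texts straight in. A minimal sketch of that pattern (the update_lda name and the model path are illustrative, assuming gensim is installed and the model was trained with an id2word dictionary):

from gensim.models import LdaModel

def update_lda(model_path, new_token_lists):
    """Incrementally train a saved LdaModel on new, already-tokenized documents."""
    model = LdaModel.load(model_path)
    # Vectorize with the dictionary the model was trained with; tokens it has
    # never seen are silently dropped by doc2bow.
    bow_corpus = [model.id2word.doc2bow(tokens) for tokens in new_token_lists]
    model.update(bow_corpus)      # online training pass over the new documents
    model.save(model_path)
    return model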
Example #2
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Soft cap on the dictionary size; once exceeded, extreme tokens are filtered out.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(time() -
                                                                 start))
        return dictionary
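The pattern above (prune_at=None plus a manual filter_extremes whenever the vocabulary outgrows max_words) also works without the file I/O and the project-specific TextPreprocessor; a minimal sketch assuming token_stream yields lists of tokens:

from gensim.corpora import Dictionary

def build_capped_dictionary(token_stream, max_words=100000):
    dictionary = Dictionary(prune_at=None)   # disable gensim's automatic pruning
    for tokens in token_stream:
        dictionary.add_documents([tokens])
        if len(dictionary) > max_words:
            # Shrink back to ~90% of the cap so filtering is not triggered on every document
            dictionary.filter_extremes(no_below=10, no_above=0.5,
                                       keep_n=int(max_words * 0.9))
    return dictionary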
Example #3
 class MyCorpus(object):
     def __init__(self, input_file, K):
         self.K = K
         self.input_file = input_file
         self.dictionary = Dictionary()
         with open(input_file, "rt") as f:
             for line in f:
                 self.dictionary.add_documents([line.split()])
         self.dictionary.filter_extremes(no_below = 2, no_above = 0.5, keep_n = K)
                 
     def __iter__(self):
         with open(self.input_file, "rt") as f:
             for line in f:
                 yield self.dictionary.doc2bow(line.rstrip().split())
                 
     def __str__(self):
         s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
         s += str(len(self.dictionary.keys())) + " features, "
         s += str(self.dictionary.num_nnz) + " non-zero entries)"
         return s
         
     def __repr__(self):
         return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
Example #4
class CorpusMUC(corpora.TextCorpus):
    def __init__(self):
        super(CorpusMUC, self).__init__()
        self.stopwords = NLTKStopwords.words('english')
        self.stopwords.extend(['``', ',', '(', ')', '.'])
        self.msgs = MUCmessages()
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.
        """
        for doc in self.msgs:
            document = [
                word for word in [
                    word_tokenize(sentence) for sentence in wordpunct_tokenize(
                        doc[1]['content'].lower())
                ]
            ]
            yield [
                str(word[0]) for word in document
                if str(word[0]) not in self.stopwords
            ]

    def __len__(self):
        """Define this so we can use `len(corpus)`"""
        if 'length' not in self.__dict__:
            logger.info(
                "caching corpus size (calculating number of documents)")
            self.length = sum(1 for doc in self.get_texts())
        return self.length
Example #5
 def __init__(self, movie_tags):
     """
         movie_tags: dict {item_id => tags}
     """
     self.item_vectors = {}
     self.id_to_idx = {}
     self.idx_to_id = []
     dictionary = Dictionary()
     for item, tags in movie_tags.items():
         dictionary.add_documents([tags])
     for item, tags in movie_tags.items():
         self.item_vectors[item] = dictionary.doc2bow(tags)
     data = []
     row_ind = []
     col_ind = []
     i = 0
     for item, tags in self.item_vectors.items():
         for (col, count) in tags:
             data.append(count)
             row_ind.append(i)
             col_ind.append(col)
         self.id_to_idx[item] = i
         self.idx_to_id.append(item)
         i += 1
     self.item_vectors = csr_matrix((data, (row_ind, col_ind)),
                                    shape=(i, len(dictionary)))
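gensim can build the same items-by-tags sparse matrix with its corpus2csc helper, avoiding the manual COO assembly; a hedged sketch under the same assumptions (movie_tags maps item ids to tag lists, row order follows dict insertion order):

from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc

def tags_to_csr(movie_tags):
    dictionary = Dictionary(movie_tags.values())
    bows = [dictionary.doc2bow(tags) for tags in movie_tags.values()]
    # corpus2csc returns a terms-by-docs matrix; transpose to items-by-tags
    return corpus2csc(bows, num_terms=len(dictionary)).T.tocsr()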
Example #6
def create_dictionary(doc_dict):
    # create gensim dictionary by using python dictionary as input
    dic = Dictionary()
    for doc in doc_dict:
        dic.add_documents([[word for (word, _) in doc_dict[doc]]])
    dic.save(dic_path)
    return dic
Example #7
    def _generate_vocabulary(self):
        vocab = Dictionary()
        session = DBSession()

        i = 0
        for question in session.query(Question).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d questions :: %d unique tokens' % (i, self.n_questions, vocab.num_docs))

            strings = [question.title, question.content] if question.content is not None else [question.title]
            vocab.add_documents([CorpusDictionary.tokenize(s) for s in strings])

        i = 0
        for answer in session.query(Answer).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d answers :: %d unique tokens' % (i, self.n_answers, vocab.num_docs))

            vocab.add_documents([CorpusDictionary.tokenize(answer.content)])

        # commit and close the session
        session.commit()
        session.close()

        return vocab
Example #8
def dic_creation(ids):
    for phase in ['train', 'val']:
        print(phase)
        with open('data/preprocessed_{}.pickle'.format(phase),
                  'rb') as preprocessed:
            preprocessed_dict = pickle.load(preprocessed)
            article_txts = []
            for id_ in tqdm(ids[phase]):
                article_txts.append(preprocessed_dict[id_]['article_text'])
#             article_txts = list(map(lambda id_: preprocessed_dict[id_]['article_text'], ids[phase]))
            print("loaded text")
            article_tokens = []
            for txt in tqdm(article_txts):
                article_tokens.append(word_tokenize(txt))


#             article_tokens = list(map(lambda txt: word_tokenize(txt), article_txts))
            print("tokenized")

        if phase == 'train':
            dct = Dictionary(article_tokens)
        else:
            with open('data/dict.pickle', 'rb') as handle:
                dct = pickle.load(handle)
                dct.add_documents(article_tokens)
        print("dict processed")

        # save dict
        with open('data/dict.pickle', 'wb') as handle:
            pickle.dump(dct, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print("saved")
Example #9
def create_dic():
    txt_in = get_in()
    txt_out = get_out()
    dic = Dictionary(txt_in+txt_out+[[EXTRA]])
    dic.add_documents([[START,STOP,EXTRA]])
    dic.save(DIC_NAME)
    return dic
Example #10
def trainModel():
    """ Train a model
    """
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load(
            "./models/LDAdump.dct")
Example #11
def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            if True in [word.is_space, word.is_stop, word.is_punct]:
                continue
            word = word.lemma_
            word = word.lower()
            processed_seg.append(word)
        processed_segments.append(processed_seg)

    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)

    processed_segments = bigram[processed_segments]

    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)

    return [dct.doc2bow(line)
            for line in processed_segments], dct, processed_segments, bigram
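A hedged usage sketch for the function above: fit the dictionary and bigram model on training segments, then reuse both on held-out text (segments are assumed to be spaCy spans, e.g. sentences from a loaded spaCy pipeline; the en_core_web_sm model is assumed to be installed):

import spacy

nlp = spacy.load("en_core_web_sm")

train_segments = list(nlp("Topic models need tokenized text. This is a small demo.").sents)
train_bow, dct, _, bigram = preprocess(train_segments)

test_segments = list(nlp("Unseen text goes through the same pipeline.").sents)
test_bow, dct, _, _ = preprocess(test_segments, dct=dct, bigram=bigram)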
Example #12
def build_tag_vectors(tag_directory_path):
    """Loads tag files, builds sparse vectors for each song
        Parameters
        ----------
            tag_directory_path : String, path of directory containing tags
        Returns
        -------
            id_vec_mapping : dict (song id => list[tuple(tagId, count)])
            dictionary : gensim Dictionary containing all tags and ids
    """
    dictionary = Dictionary()
    for f in listdir(tag_directory_path):
        with open(tag_directory_path+"/"+f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            dictionary.add_documents([tokens])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.compactify()
    id_vec_mapping = {}
    for f in listdir(tag_directory_path):
        song_id = f[0:-4]
        with open(tag_directory_path+"/"+f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
        sparse_vec = dictionary.doc2bow(tokens)
        add_to_dictionary(id_vec_mapping, (song_id, sparse_vec))
    return id_vec_mapping, dictionary
Example #13
class KeywordDict():
    # Load dictionary.bin if it exists; otherwise build it from scratch and save it
    def __init__(self, name="default", phraser=None):
        self.name = "dictionary_" + name + ".bin"
        self.phraser = phraser
        if self.name in os.listdir(dir_dictionary):
            with open(dir_dictionary + self.name, 'rb') as dic:
                self.get_dict = pickle.load(dic)
            print("keyword dictionary loaded")
        else:
            print("dictionary not exists")
            print("start building...")
            self.build_dictionary()
            self.save()

    # Build the dictionary
    def build_dictionary(self):
        self.get_dict = Dictionary()
        tickers = [
            i for i in os.listdir(dir_cleaned_news) if i.endswith(".csv")
        ]

        for ticker in tickers:
            df = pd.read_csv(dir_cleaned_news + ticker, index_col=0)
            self.get_dict.add_documents(tokenizer(df['content'], self.phraser))
            print(ticker + " added")
        print("done")

    # Save
    def save(self):
        with open(dir_dictionary + self.name, "wb") as dic:
            pickle.dump(self.get_dict, dic)
Example #14
def preprocess(documents,
               stem=False,
               vocab_size=10000,
               oov_token="<OOV>",
               oov_id=-1):
    """Preprocess documents.

    Args:
        documents: An array of strings, each string representing a document.
        stem: (bool) Whether to use a stemmer. Defaults to False.


    Returns:
        (gensim Dictionary, tokenized documents)
    """
    porter_stemmer = PorterStemmer()

    def process_document(doc):
        tokens = word_tokenize(doc)
        tokens = [token.lower() for token in tokens if token.isalpha()]
        if stem:
            tokens = [porter_stemmer.stem(token) for token in tokens]
        return tokens

    tokenized_docs = list(map(process_document, documents))

    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=vocab_size)

    # Add OOV to dictionary
    dictionary.add_documents([["<OOV>"]])

    return dictionary, tokenized_docs
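A hedged sketch of how the returned dictionary and its <OOV> entry might be used to encode documents as id sequences (raw_documents is assumed to be a list of strings):

dictionary, tokenized_docs = preprocess(raw_documents)
oov = dictionary.token2id["<OOV>"]
encoded = [[dictionary.token2id.get(token, oov) for token in doc]
           for doc in tokenized_docs]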
Example #15
    def _generate_vocabulary(self):
        vocab = Dictionary()
        session = DBSession()

        i = 0
        for question in session.query(Question).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d questions :: %d unique tokens' %
                            (i, self.n_questions, vocab.num_docs))

            strings = [question.title, question.content
                       ] if question.content is not None else [question.title]
            vocab.add_documents(
                [CorpusDictionary.tokenize(s) for s in strings])

        i = 0
        for answer in session.query(Answer).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d answers :: %d unique tokens' %
                            (i, self.n_answers, vocab.num_docs))

            vocab.add_documents([CorpusDictionary.tokenize(answer.content)])

        # commit and close the session
        session.commit()
        session.close()

        return vocab
Example #16
    def testFilterTokens(self):
        self.maxDiff = 10000
        d = Dictionary(self.texts)

        removed_word = d[0]
        d.filter_tokens([0])

        expected = {
            'computer': 0,
            'eps': 8,
            'graph': 10,
            'human': 1,
            'interface': 2,
            'minors': 11,
            'response': 3,
            'survey': 4,
            'system': 5,
            'time': 6,
            'trees': 9,
            'user': 7
        }
        del expected[removed_word]
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

        expected[removed_word] = len(expected)
        d.add_documents([[removed_word]])
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
Example #17
def create_dictionary(dataset, texts):
    dict = Dictionary([])
    for text in texts:
        dict.add_documents([text])
    dict.save_as_text('../dataset_files/dictionary.txt')

    return dict
Example #18
def _bow(table,
         input_col,
         add_words=None,
         no_below=1,
         no_above=0.8,
         keep_n=10000):
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words != None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n,
                               keep_tokens=None)

    params = {
        'Input Column': input_col,
        'Minimum Number of Occurrence': no_below,
        'Maximum Fraction of Occurrence': no_above,
        'Keep N most Frequent': keep_n
    }

    empty_description = ''
    if len(list(dictionary.dfs.values())) == 0:
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = 'Out table is empty since parameter \"Minimum Number of Occurrence\" is greater than the maximum of document frequency.'
    else:
        out_table = pd.DataFrame.from_dict(dictionary.token2id,
                                           orient='index').drop([0], axis=1)
        out_table.insert(loc=0,
                         column='token',
                         value=dictionary.token2id.keys())

        token_cnt = sorted(dictionary.dfs.items(), key=operator.itemgetter(0))
        dfs_list = []
        for i in range(len(dictionary.dfs)):
            dfs_list.append(token_cnt[i][1])
        out_table['document_frequency'] = dfs_list

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        |# Bag of Words Result
        |### Parameters
        |
        | {display_params}
        |
        | {description}
        |
        """.format(display_params=dict2MD(params),
                   description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
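The token/document_frequency table above can also be derived directly from the dictionary's token2id and dfs mappings; a hedged alternative sketch:

import pandas as pd

def dictionary_to_table(dictionary):
    rows = [(token, dictionary.dfs.get(idx, 0))
            for token, idx in dictionary.token2id.items()]
    return pd.DataFrame(rows, columns=['token', 'document_frequency'])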
Example #19
 def load_dictionary(self, filepath):
     dictionary = Dictionary() 
     with open(filepath, "rb") as f: 
         for line in f.readlines():
             # example = SampleTrainingExample(line)
             # context = example.context
             dictionary.add_documents([[word.lower() for word in line.split()]])
     return dictionary
Example #20
def get_coherence(config, topicvec, docwords, glove_vectors):
    """Calculate UMass and w2v (GloVe) coherence scores."""

    # Clean docwords
    docs = []
    for word_list in docwords:
        doc = word_list[0]
        docs.append(doc)

    # Clean topics
    byte_topics = topicvec.printTopWordsInTopics(topicvec.docs_theta, True)
    word_topics = []
    one_word = []
    for topic in byte_topics:
        new_topic = []
        for word in topic:
            if type(word) != str:
                word = word.decode()
            new_topic.append(word)
            one = [word]
            one_word.append(one)
        word_topics.append(new_topic)

    # Get dictionary
    vocab_dict = Dictionary(docs[1:])
    # Make sure words in topics are in the dictionary
    vocab_dict.add_documents(one_word)

    # Get corpus
    corpus = [vocab_dict.doc2bow(doc) for doc in docs]

    # Calculate UMass coherence score
    # The closer to 0, the more coherent
    cm = CoherenceModel(topics=word_topics,
                        corpus=corpus,
                        dictionary=vocab_dict,
                        coherence='u_mass')
    umass_coherence = cm.get_coherence()

    # Calculate GloVe coherence score
    # Ranges between 0 and 1
    # The closer to 1, the better
    cm = CoherenceModel(topics=word_topics,
                        corpus=corpus,
                        dictionary=vocab_dict,
                        coherence='c_w2v',
                        keyed_vectors=glove_vectors)
    glove_coherence = cm.get_coherence()

    # Log coherence score and other metrics
    results_dict = OrderedDict([('num_topics', config['K']),
                                ('alpha0', config['alpha0']),
                                ('alpha1', config['alpha1']),
                                ('delta', config['iniDelta']),
                                ('umass', umass_coherence),
                                ('glove', glove_coherence)])

    return results_dict
Example #21
def build_vocabulary_and_corpus():
    '''
    Build the vocabularies and stem sequences for each type of entities.
    '''

    # Vocabulary (same for question and answers)
    v = Dictionary()

    # Stemmer.
    stemmer = PorterStemmer()

    # Tokenizer.
    tokenizer = TweetTokenizer()

    # Read indexes
    user_index, question_index, answer_index, comment_index = read_indexes()

    # Question, answer
    q = {}
    a = {}

    # Read entities.
    with open(entity_path, 'rb') as obj:
        entities = pickle.load(obj)

    # Browse question and answers to first build vocabulary.
    for e in entities:
        # Question or answer.
        if e['type'] == 'Q' or e['type'] == 'A':
            # String content.
            title = str(e['title']).encode('utf-8').lower()
            content = str(e['content']).encode('utf-8').lower()
            # Tokenize
            d = tokenizer.tokenize(title + content)
            # Stem word
            d = [stemmer.stem(s) for s in d]
            # Process vocabulary.
            v.add_documents([d])
            # Question
            if e['type'] == 'Q':
                q[question_index[e['id']]] = d
            # Answer
            if e['type'] == 'A':
                a[answer_index[e['id']]] = d

    # Write question corpus.
    with open(os.path.join(data_path, 'q.corpus'), 'wb') as f:
        pickle.dump(q, f)

    # Write answer corpus.
    with open(os.path.join(data_path, 'a.corpus'), 'wb') as f:
        pickle.dump(a, f)

    # Write to analyse.
    v.filter_extremes(no_below=1000, keep_n=10000)
    v.compactify()
    v.save(os.path.join(data_path, "raw_vocabulary.gensim"))
Example #22
class LoadCorpora(Component, TextCorpus):
    """
    Load corpus:
    input is an array with a list of json files
    """
    def __init__(self, input_files=None):
        """Redefine the gensim's TextCorpus init method"""

        super().__init__()

        self.input = input_files
        self.dictionary = Dictionary(prune_at=5000000)
        self.metadata = False
        if input_files is not None:
            self.dictionary.add_documents(self.get_texts(), prune_at=5000000)
        else:
            self.logger.warning(
                "No input document stream provided; assuming "
                "dictionary will be initialized some other way.")

    def get_texts(self):
        """
        Iterate through documents:
        yield each token on each document
        """

        if not isinstance(self.input, list):
            raise ConfigError('Input argument is not a List')

        for filename in self.input:  # each file
            with open(filename, 'r') as stream:
                for line in stream:  # each line
                    doc = json.loads(line)
                    yield doc['content'].split()  # split on each word

    def __iter__(self):
        """
        Iterate through documents:
        yield the bow representation of each document
        """

        if not isinstance(self.input, list):
            raise ConfigError('Input argument is not a List')

        for filename in self.input:  # each file
            with open(filename, 'r') as stream:
                for line in stream:  # each line
                    doc = json.loads(line)
                    yield self.dictionary.doc2bow(doc['content'].split())

    def save(self):
        """Override abstract method"""
        return

    def save_corpus(self, fname, corpus, id2word=None, metadata=False):
        """Override abstract method"""
        return
Example #23
    def create_dict(self, corpus_file):
        dictionary = Dictionary()
        with open(corpus_file, "r") as infile:
            for line in infile:
                doc = line.split()  # doc as bag of words (bow) of tokens in this line
                dictionary.add_documents([doc])
        return dictionary
Example #24
def test_save_load():
    dct = Dictionary()
    docs = ['一种 大头菜 自然风', '风 主要 包括 大头菜 风', '架 主要 包括 底座 支柱']
    docs_token_list = get_token_lists_of_docs(docs)
    dct.add_documents(docs_token_list)
    corpus = [dct.doc2bow(['大头菜', '风', '底座'])]
    print('corpus to save is {}'.format(corpus))
    save_path = 'resources/corpus/test_corpus.mm'

    save2disk(save_path, corpus)
    load_corpus = load_from_disk(save_path)
    print('load corpus is {}'.format(load_corpus))
Example #25
def get_dictionary(documents: Dict[int, List[str]]) -> Dictionary:
	if os.path.exists(DICTIONARY_FILE_NAME):
		print(f"loading dictionary from {DICTIONARY_FILE_NAME}")
		gensim_dict = Dictionary.load(DICTIONARY_FILE_NAME)
	else:
		print("creating dictionary")
		gensim_dict = Dictionary()
		gensim_dict.add_documents(documents.values())
		gensim_dict.compactify()
		print(f"saving dictionary to {DICTIONARY_FILE_NAME}")
		gensim_dict.save(DICTIONARY_FILE_NAME)
	return gensim_dict
Example #26
 def testFilterKeepTokens_keepn(self):
     # keep_tokens should also work if the keep_n parameter is used, but only
     # to keep a maximum of n (so if len(keep_tokens) > keep_n, some of the
     # tokens to keep still get removed to reduce the size to keep_n!)
     d = Dictionary(self.texts)
     # Note: there are four tokens with freq 3, all the others have frequency 2
     # in self.texts. In order to make the test result deterministic, we add
     # 2 tokens of frequency one
     d.add_documents([['worda'], ['wordb']])
     # this should keep the 4 tokens with freq 3 and the one we want to keep
     d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
     expected = {'graph', 'trees', 'system', 'user', 'worda'}
     self.assertEqual(set(d.token2id.keys()), expected)
Example #27
 def testFilterKeepTokens_keepn(self):
     # keep_tokens should also work if the keep_n parameter is used, but only
     # to keep a maximum of n (so if len(keep_tokens) > keep_n, some of the
     # tokens to keep still get removed to reduce the size to keep_n!)
     d = Dictionary(self.texts)
     # Note: there are four tokens with freq 3, all the others have frequency 2
     # in self.texts. In order to make the test result deterministic, we add
     # 2 tokens of frequency one
     d.add_documents([['worda'], ['wordb']])
     # this should keep the 4 tokens with freq 3 and the one we want to keep
     d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
     expected = {'graph', 'trees', 'system', 'user', 'worda'}
     self.assertEqual(set(d.token2id.keys()), expected)
Example #28
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """
    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
        self.gensim_model = Dictionary(documents=tokenized_docs,
                                       prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        check = lambda x: [x] if isinstance(x, string_types) else x
        docs = check(docs)
        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs))
        X = [[] for _ in range(0, len(tokenized_docs))]

        for k, v in enumerate(tokenized_docs):
            bow_val = self.gensim_model.doc2bow(v)
            X[k] = bow_val

        return X

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
        self.gensim_model.add_documents(tokenized_docs)
        return self
Example #29
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        check = lambda x: [x] if isinstance(x, string_types) else x
        docs = check(docs)
        tokenized_docs = [list(self.tokenizer(x)) for x in docs]
        X = [[] for _ in range(0, len(tokenized_docs))]

        for k, v in enumerate(tokenized_docs):
            bow_val = self.gensim_model.doc2bow(v)
            X[k] = bow_val

        return X

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
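A hedged usage sketch for the transformer above, showing fit, transform and the incremental partial_fit path:

docs = ["the quick brown fox", "the lazy dog sleeps"]
t2b = Text2BowTransformer(prune_at=100000)
t2b.fit(docs)
bows = t2b.transform(["the quick dog"])      # list of (token_id, count) pairs per doc
t2b.partial_fit(["a brand new document"])    # expands the underlying Dictionary in place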
Example #30
def create_comment_dictionary(filter_every_n=1000,
                              max_iter=20,
                              from_db=True,
                              get_data_func=None):
    comment_dict = Dictionary()
    text_gen = data_preprocessor(max_iter=max_iter,
                                 from_db=from_db,
                                 get_data_func=get_data_func)
    n_iter = 0
    for _, stemmed_text, _ in text_gen:
        comment_dict.add_documents([stemmed_text])
        n_iter += 1
        if n_iter % filter_every_n == 0:
            comment_dict.filter_extremes()
    return comment_dict
Example #31
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_commnets.txt')

    if config.PROCESSED_PATH not in os.listdir(config.DATA_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}

    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts =  np.concatenate([train_df.comment_text.fillna('N/A').values,
                             test_df.comment_text.fillna('N/A').values])


    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()
    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
Example #32
def build_hdp_vec(docs, targets, dct=None, hdp=None):
    docs = [[str(o) for o in one] for one in docs]

    if dct is None:  # train set
        dct = Dictionary(docs)  # Dictionary(docs) already ingests every document

    copus = [dct.doc2bow(o) for o in docs]
    if hdp is None:  # train
        hdp = HdpModel(copus, dct)

    v = [hdp[o] for o in copus]
    v_d = matutils.corpus2dense(v, num_terms=len(dct.token2id)).T

    return copus, v_d, targets, dct, hdp
Example #33
    def testFilterTokens(self):
        self.maxDiff = 10000
        d = Dictionary(self.texts)

        removed_word = d[0]
        d.filter_tokens([0])

        expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
                'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
                'system': 5, 'time': 6, 'trees': 9, 'user': 7}
        del expected[removed_word]
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

        expected[removed_word] = len(expected)
        d.add_documents([[removed_word]])
        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
Example #34
def get_data_tokenizer(fromdate, todate):
    print 'Starting get and save data from mysql-server into local folder....'

    fromdate = fromdate + ' 00:00:00'
    todate = todate + ' 23:59:59'

    connection = my_connection.getConnection()
    cursor = connection.cursor()

    query = 'SELECT id, vntokenizer, catid FROM news WHERE create_time BETWEEN ' + '\'' + fromdate + '\' AND \'' + todate + '\';'
    print query

    cursor.execute(query)
    rows = cursor.fetchall()
    count = 0

    token_dictionary = Dictionary()
    data = dict()

    for row in rows:
        id = row[0]
        tokenizer = row[1]
        catid = row[2]
        if tokenizer != None:
            tokenizer = tokenizer.lower()
            count += 1
            print count
            print tokenizer
            token_list = tokenizer.split(' ')
            valid_token_list = list()
            for token in token_list:
                if my_util.check_valid_token(token):
                    valid_token_list.append(token)
            token_dictionary.add_documents([valid_token_list])
            if catid == my_catid:
                data[id] = valid_token_list

    my_connection.closeConnection(connection)

    # save dictionary and data into text file
    token_dictionary.save_as_text('..' + parameter.FILE_DICTIONARY)
    fb = open('..' + parameter.FILE_DATA, 'wb')
    pickle.dump(data, fb)
    fb.close()

    print 'Done get and save data from mysql-server!'
Example #35
class MyCorpus(object):
    def __init__(self, filename, max_vocab_size = 2000000):
        self.filename = filename
        self.max_vocab_size = max_vocab_size
        self.dictionary = Dictionary()
        self._build_dict()
        
    def _build_dict(self):
        with open(self.filename, "rt") as f:
            for line in f:
                doc = line.rstrip().split()
                self.dictionary.add_documents([doc])
        
    def __iter__(self):
        with open(self.filename, "rt") as f:
            for sentence in f:
                yield sentence.rstrip().split()
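Because __iter__ yields plain token lists, this corpus can stream straight into Word2Vec; a hedged sketch (parameter names follow gensim 4.x, and the input file name is hypothetical):

from gensim.models import Word2Vec

corpus = MyCorpus("sentences.txt")
# Word2Vec iterates over the corpus several times, which MyCorpus supports
# because __iter__ reopens the file on every pass.
w2v = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=5)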
Example #36
def main(path, epochs):
    with open(path) as f:
        ds = csv.DictReader(f)
        tweets = (d['tweet_text'] for d in ds)
        tweets = list(tweets)

    dct = Dictionary()
    with Pool() as pool:
        dct.add_documents(pool.map(preprocess, tweets))
    dct.filter_extremes(no_below=10, no_above=0.5)

    model = SimpleEmbedder(dct, dims=100)
    model.load_docs(tweets[-100000:])

    for i in range(int(epochs)):
        loss = model._epoch()
        print(f'Epoch {i} loss: {loss[1]}')
        np.save('embeddings', model.embeddings)
Example #37
def compile_vocab(docs,
                  limit=1e6,
                  verbose=0,
                  tokenizer=Tokenizer(stem=None, lower=None, strip=None)):
    """Get the set of words used anywhere in a sequence of documents and assign an integer id

    This vectorizer is much faster than the scikit-learn version (and only requires low/constant RAM ?).

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> d = compile_vocab(gen, verbose=0)
    >>> d
    <gensim.corpora.dictionary.Dictionary ...>
    >>> print(d)
    Dictionary(4 unique tokens: [u'AAA', u'BBB', u'CCC', u'label'])
    >>> sorted(d.token2id.values())
    [0, 1, 2, 3]
    >>> sorted(d.token2id.keys())
    [u'AAA', u'BBB', u'CCC', u'label']
    """
    tokenizer = make_tokenizer(tokenizer)
    d = Dictionary()

    try:
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        pass
    for i, doc in enumerate(docs):
        # if isinstance(doc, (tuple, list)) and len(doc) == 2 and isinstance(doc[1], int):
        #     doc, score = docs
        try:
            # in case docs is a values() queryset (dicts of records in a DB table)
            doc = doc.values()
        except AttributeError:  # doc already is a values_list
            if not isinstance(doc, str):
                doc = ' '.join([str(v) for v in doc])
            else:
                doc = str(doc)
        if i >= limit:
            break
        d.add_documents([list(tokenizer(doc))])
        if verbose and not i % 100:
            log.info('{}: {}'.format(i, repr(d)[:120]))
    return d
Example #38
    def __buildVectors(self, dataset_file):
        lines = 0
        dct = Dictionary()
        tmp_file = TemporaryFile(mode='w+t', encoding='utf-8')
        for doc_idx, (document,
                      lines) in enumerate(self.__buildDocument(dataset_file)):
            dct.add_documents([document])
            tmp_file.write(' '.join(document) + '\n')
            if doc_idx % 500 == 499:
                G.log.debug('%d', doc_idx)
        if dct.num_docs < self.__LeastDocuments:  # too few documents (or too small a dictionary) to be worth clustering
            tmp_file.close()
            raise UserWarning('Too few records[%d]' % dct.num_docs)

        # Drop low-frequency words and compact the dictionary
        num_token = len(dct)
        no_below = int(min(self.__NoBelow, int(dct.num_docs / 50)))
        dct.filter_extremes(no_below=no_below,
                            no_above=0.999,
                            keep_n=self.__KeepN)
        dct.compactify()
        G.log.info(
            'Dictionary[%d tokens, reduced from %d] built with [%s]. '
            '[%d]records(%d lines, %d words) in %s', len(dct), num_token,
            self.__ruleSet[0], dct.num_docs, lines, dct.num_pos, dataset_file)
        if len(dct) < self.__LeastTokens:  # too few tokens in the dictionary; re-sample
            G.log.info('Too few tokens[%d], Re-sample with next RuleSet].' %
                       (len(dct)))
            tmp_file.close()
            return None, None

        # Build the tf-idf bag-of-words and document vectors
        tfidf_model = TfidfModel(dictionary=dct, normalize=False)
        vectors = np.zeros((dct.num_docs, len(dct)))
        tmp_file.seek(0)
        for doc_idx, new_line in enumerate(tmp_file):
            for (word_idx, tf_idf_value) in tfidf_model[dct.doc2bow(
                    new_line.split())]:  # [(id, tf-idf), ...], ids in ascending order
                vectors[doc_idx, word_idx] = tf_idf_value
        G.log.info('[%d*%d]Vectors built, %.2f%% non-zeros.' %
                   (dct.num_docs, len(dct),
                    dct.num_nnz * 100 / len(dct) / dct.num_docs))
        tmp_file.close()
        return dct, vectors
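The manual numpy fill above can be expressed with gensim's corpus2dense helper; a hedged sketch of the same tf-idf vectorization given a dictionary and pre-tokenized documents:

from gensim.matutils import corpus2dense
from gensim.models import TfidfModel

def tfidf_vectors(dct, token_docs):
    tfidf_model = TfidfModel(dictionary=dct, normalize=False)
    bows = [dct.doc2bow(tokens) for tokens in token_docs]
    # corpus2dense returns a terms-by-docs array; transpose to docs-by-terms
    return corpus2dense(tfidf_model[bows], num_terms=len(dct)).T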
Example #39
class DictionaryLearner(object):
    '''Learn a gensim dictionary from all available documents.'''
    
    def __init__(self, n=4):
        '''Initialize a DictionaryLearner instance using vocabulary of ngrams of size `n`.'''
        self._ngram = NgramTransformer(n)
        self._dictionary = Dictionary()
    
    def fit(self, documentstorage, filter_extremes=True):
        '''Fit a dictonary using documents from given documentstorage.'''
        for document in documentstorage.load_iterator(u''):
            text_document = document.text
            ngrams = self._ngram.transform([text_document])
            self._dictionary.add_documents(ngrams)
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        return self._dictionary
Example #40
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        if isinstance(docs, string_types):
            docs = [docs]
        tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
        return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
Example #41
class SeriesCorpus(TextCorpus):
    def __init__(self, series, vocab=None, stem=False, bigram=None,
                 labels=True):
        """ Create a corpus that returns one row at a time out
            of a Pandas Series"""
        self.series = series
        self.metadata = False
        if vocab is not None:
            vocab = set(vocab)
        self.vocab = vocab
        self.labels = labels
        self.kwargs = dict(stem=stem, bigram=bigram)
        logging.info("Building SeriesCorpus")
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                ls = LabeledSentence(line.split(' '), label)
                yield ls
        else:
            for index, line in zip(self.series.index, self.series.values):
                yield line.split(' ')

    def line_iter(self, line):
        if self.vocab is not None:
            for word in line.split(' '):
                if word in self.vocab:
                    yield word
        else:
            for word in line.split(' '):
                yield word

    def get_texts(self):
        logging.info("Iterating SeriesCorpus")
        for lineno, line in enumerate(self.series.values):
            if self.metadata:
                yield self.line_iter(line), (lineno,)
            else:
                yield self.line_iter(line)
Example #42
def compile_vocab(docs, limit=1e6, verbose=0, tokenizer=Tokenizer(stem=None, lower=None, strip=None)):
    """Get the set of words used anywhere in a sequence of documents and assign an integer id

    This vectorizer is much faster than the scikit-learn version (and only requires low/constant RAM ?).

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> d = compile_vocab(gen, verbose=0)
    >>> d
    <gensim.corpora.dictionary.Dictionary ...>
    >>> print(d)
    Dictionary(4 unique tokens: [u'AAA', u'BBB', u'CCC', u'label'])
    >>> sorted(d.token2id.values())
    [0, 1, 2, 3]
    >>> sorted(d.token2id.keys())
    [u'AAA', u'BBB', u'CCC', u'label']
    """
    tokenizer = make_tokenizer(tokenizer)
    d = Dictionary()

    try:
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        pass
    for i, doc in enumerate(docs):
        # if isinstance(doc, (tuple, list)) and len(doc) == 2 and isinstance(doc[1], int):
        #     doc, score = docs
        try:
            # in case docs is a values() queryset (dicts of records in a DB table)
            doc = doc.values()
        except AttributeError:  # doc already is a values_list
            if not isinstance(doc, str):
                doc = ' '.join([str(v) for v in doc])
            else:
                doc = str(doc)
        if i >= limit:
            break
        d.add_documents([list(tokenizer(doc))])
        if verbose and not i % 100:
            log.info('{}: {}'.format(i, repr(d)[:120]))
    return d
Example #43
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """Base Text2Bow module , wraps :class:`~gensim.corpora.dictionary.Dictionary`.

    For more information please have a look to `Bag-of-words model <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.

    """
    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Parameters
        ----------
        prune_at : int, optional
            Total number of unique words. Dictionary will keep not more than `prune_at` words.
        tokenizer : callable (str -> list of str), optional
            A callable to split a document into a list of each terms, default is :func:`gensim.utils.tokenize`.

        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of str
            A collection of documents used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer`
            The trained model.

        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """Get the BOW format for the `docs`.

        Parameters
        ----------
        docs : {iterable of str, str}
            A collection of documents to be transformed.

        Returns
        -------
        iterable of list (int, int) 2-tuples.
            The BOW representation of each document.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        if isinstance(docs, string_types):
            docs = [docs]
        tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
        return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            1. On an unfitted model in which case the dictionary is initialized and trained on `X`.
            2. On an already fitted model in which case the dictionary is **expanded** by `X`.

        Parameters
        ----------
        X : iterable of str
            A collection of documents used to train the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
Example #44
class MyCorpus(object):
    '''
    Corpus class for streaming review documents
    '''
    def __init__(self, file_list, file_dir, dictionary = None, mindf = MINDF, maxdf = MAXDF, \
                 maxwords = MAXWORDS, cluster_words = CLUSTER_WORDS, cluster_ul = CLUSTER_UL):
        self.file_list = file_list           # list of cuisine text files
        self.file_dir = file_dir             # directory of cuisine text files
        self.maxwords = maxwords             # maximum number of words to keep after building dictionary from clusters
        self.cluster_words = cluster_words   # maximum number of words to keep from each cluster
        self.cluster_ul = cluster_ul         # upper proportion of reviews to limit for cluster processing
        self.mindf = mindf                   # minimum number of documents to keep word
        self.maxdf = maxdf                   # max proportion of documents to keep word
        self.agglomerate = True              # return clusters as single documents (True) or return single reviews (False)
        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            self._build_dict()
            
    def __str__(self):
        return "<MyCorpus at " + str(hex(id(self))) + ">"
        
    def __repr__(self):
        return self.__str__()
    
    def _build_dict(self):
        for filename in self.file_list:
            dictionary = dict()
            num_reviews = 0
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                for line in f:
                    num_reviews += 1
                    words = line[REVIEW_INDEX:].split()
                    for word in set(words):
                        if word not in dictionary:
                            dictionary[word] = 1
                        else:
                            dictionary[word] += 1
                doc = [item for item in dictionary.items() if dictionary[item[0]] > 2 and dictionary[item[0]] / num_reviews < self.cluster_ul]
                doc.sort(key = lambda x: -x[1])
                doc = [word for word, f in doc]
                self.dictionary.add_documents([doc[:self.cluster_words]])
                print("%s added to corpus dictionary!" % (filename,))
        self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords)
        self.dictionary.save("cuisine_dictionary.gensimDict")
        
    def __iter__(self):
        '''
        Iterates through cuisines by combining all reviews for each cuisine into a single
        processed document.  Also stores the length of each processed document
        '''
        if self.agglomerate:
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    doc = " ".join([line[REVIEW_INDEX:].rstrip() for line in f])
                    yield self.dictionary.doc2bow(doc.split())
        else:
            reviewIDs = set()
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    for line in f:
                        id = line[:RATING_INDEX - 1]
                        if id not in reviewIDs:
                            reviewIDs.update([id])
                            doc = line[REVIEW_INDEX:].rstrip()
                            yield self.dictionary.doc2bow(doc.split())
Example #45
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    limit = None
    if opts.limit:
        limit = int(opts.limit)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #46
class YahooDictionary:

    def __init__(self, source_file, vocab_size=20000, max_ans_len=1000, max_sub_len=100, max_cont_len=500, dict_file_name=''):
        assert os.path.exists(source_file), 'The file "%s" was not found' % source_file

        self.source_file = source_file
        self.vocab = Dictionary()
        self.vocab_size = vocab_size

        print('Creating XML tree...')
        tree = ET.parse(source_file)
        self.root = tree.getroot()

        # maximum lengths for everything
        self.max_ans_len = max_ans_len
        self.max_sub_len = max_sub_len
        self.max_cont_len = max_cont_len

        print('Creating dictionary...')
        self._create_dictionary()

    @staticmethod
    def tokenize(text):
        return gensim.utils.tokenize(text, to_lower=True)

    def _create_dictionary(self):
        categories = set()

        # create dictionary
        for vespaadd in self.root.iter('vespaadd'):
            doc = vespaadd.find('document')

            subject_text = YahooDictionary.tokenize(doc.find('subject').text)
            content_text = YahooDictionary.tokenize(doc.find('content').text)

            self.vocab.add_documents([subject_text, content_text], prune_at=self.vocab_size)

            # category
            categories.add(doc.find('cat').text)

            # answers
            answers = [YahooDictionary.tokenize(answer.text) for answer in doc.find('nbestanswers').getchildren()]
            self.vocab.add_documents(answers, prune_at=self.vocab_size)

        self.cat_to_idx = dict((c, i+1) for i, c in enumerate(categories))
        self.idx_to_cat = dict((i+1, c) for i, c in enumerate(categories))

    def get_docs(self):

        all_answers = []
        all_subjects = []
        all_contents = []
        all_categories = []

        # create dictionary
        for vespaadd in self.root.iter('vespaadd'):
            doc = vespaadd.find('document')

            # subject and content
            subject_text_iter = YahooDictionary.tokenize(doc.find('subject').text)
            content_text_iter = YahooDictionary.tokenize(doc.find('content').text)

            subject_enc = [self.vocab.token2id[x] for x in itertools.islice(subject_text_iter, self.max_sub_len)]
            content_enc = [self.vocab.token2id[x] for x in itertools.islice(content_text_iter, self.max_cont_len)]

            # category index
            category = self.cat_to_idx[doc.find('cat').text]

            # answers
            answers = [YahooDictionary.tokenize(answer.text) for answer in doc.find('nbestanswers').getchildren()]

            for answer in answers:
                answer_enc = [self.vocab.token2id[x] for x in itertools.islice(answer, self.max_ans_len)]

                all_categories.append(category)
                all_subjects.append(subject_enc)
                all_contents.append(content_enc)
                all_answers.append(answer_enc)

        return pad_sequences(all_answers, self.max_ans_len),\
               pad_sequences(all_subjects, self.max_sub_len),\
               pad_sequences(all_contents, self.max_cont_len),\
               np.array(all_categories)