Example #1
class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
    Initialize the corpus. This scans through all the emails once, to determine the corpus
    vocabulary. (only the first `keep_words` most frequent words that appear in at least 
    `no_below` documents are kept).
    """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
    Walk the file system, strip punctuation, normalize all numbers to be '2'.
    """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print "Iteration: %i" % length
                yield tokenize(email)
        self.length = length  # cache corpus length
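A possible usage sketch (the mail directory path is hypothetical; walk_os, gen_open, strip_punct, tokenize and ARTICLE_MIN_CHARS come from the original project):

from gensim.corpora import MmCorpus

# TextCorpus.__iter__ yields self.dictionary.doc2bow(tokens) for each text,
# so the corpus can be streamed straight into gensim's serializers.
corpus = EnronCorpus('/path/to/enron/maildir')   # first pass builds the vocabulary
MmCorpus.serialize('enron_bow.mm', corpus)       # second pass writes bag-of-words vectors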
Example #2
    def loadDictionary(fname, mapping_only=True):
        """
        Load previously stored mapping between words and their ids.

        The result can be used as the `id2word` parameter for input to transformations.
        """
        if mapping_only:
            result = {}
            for lineNo, line in enumerate(open(fname)):
                cols = line[:-1].split('\t')
                if len(cols) == 2:
                    wordId, word = cols
                elif len(cols) == 3:
                    wordId, word, dfs = cols
                else:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                result[int(wordId)] = word # dfs not used
        else:
            result = Dictionary()
            for lineNo, line in enumerate(open(fname)):
                cols = line[:-1].split('\t')
                if len(cols) == 3:
                    wordId, word, dfs = cols
                else:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                wordId = int(wordId)
                result.token2id[word] = wordId
                result.dfs[wordId] = int(dfs)

        return result
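A round-trip sketch for the tab-separated layout parsed above (the file is written by hand here to guarantee the expected wordId<TAB>word<TAB>docFrequency columns; whether loadDictionary is called as a module-level function or a static method depends on the surrounding class):

from gensim.corpora.dictionary import Dictionary

docs = [['human', 'interface', 'computer'], ['survey', 'user', 'computer']]
dct = Dictionary(docs)
with open('toy_wordids.txt', 'w') as out:
    for word, word_id in dct.token2id.items():
        out.write("%d\t%s\t%d\n" % (word_id, word, dct.dfs[word_id]))

id2word = loadDictionary('toy_wordids.txt')                    # e.g. {0: 'computer', 1: 'human', ...}
full = loadDictionary('toy_wordids.txt', mapping_only=False)   # Dictionary with token2id and dfs filled in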
Example #3
class WordCorpus(BaseCorpus):
    """\
    Wrapper around a `gensim.corpora.dictionary.Dictionary`.

    This is a light-weight alternative to `CableCorpus` to create an initial
    word dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()

        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """
    def __init__(self, dct=None, tokenizer=None):
        """\
        Initializes the wrapper.

        `dct`
            An existing Dictionary or ``None`` if a new Dictionary should be
            created (default)
        `tokenizer`
            A tokenizer function or ``None``, see `BaseCorpus`
        """
        super(WordCorpus, self).__init__(tokenizer)
        self.dct = Dictionary() if dct is None else dct

    def add_words(self, reference_id, words):
        self.dct.doc2bow(words, True)
Example #4
def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated."""
    # Toy corpus.
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],

        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car']
    ]

    dictionary = Dictionary(corpus)

    # Transforming corpus with dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
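A hedged follow-up showing a typical consumer of this toy corpus; num_topics=2 is an assumption matching the two themes (food and traffic):

from gensim.models import LdaModel

corpus, dictionary = get_corpus_dictionary()
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)
for topic_id in range(lda.num_topics):
    print(lda.print_topic(topic_id, topn=3))   # top 3 words per topic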
Example #5
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Example #6
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data
        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example #7
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)

    return (gdict, gdoc)
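A usage sketch, assuming an installed English spaCy pipeline (the model name is illustrative) and that extract.words above is textacy's word extractor:

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("The quick brown fox jumps over the lazy dog. The dog sleeps.")

gdict, gdoc = doc_to_gensim(doc, lemmatize=True)
print(gdict.token2id)   # e.g. {'fox': 0, 'jump': 1, ...}
print(gdoc)             # [(word id, count), ...] pairs for this document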
Example #8
class CorpusOfMethodContents(TextCorpus):
    
    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)
        
    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1  # index of the entry just appended
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update = True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())
    
    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex.keys():
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None
    
    def get_texts(self):
        for content in self.methodContents:
            yield content
Example #9
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # index for every word with frequency over 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vector for every word with frequency over 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        combined=parse_dataset(combined)
        combined= sequence.pad_sequences(combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec,combined
    else:
        print 'No data provided...'
Example #10
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
        be a 3-tuple of the picklefile names in the following order:
        
        (title, body, tags)
        
        If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in xrange(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print "Before filtering..."
    print "utitledict:", utitledict
    print "ubodydict:", ubodydict
    print "utagdict:", utagdict
    
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print "saving utitledict..."
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print "saving ubodydict..."
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print "saving utagdict..."
            utagdict.save(save_pickle_tup[2])
            
    return (utitledict, ubodydict, utagdict)
Example #11
def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))

    if dictionary_path:
        dictionary.save(dictionary_path)

    return dictionary
Example #12
 def build_dictionary(self):
     documents = ReadThreads(
         self.board, input_dir=self.input_dir, file_type='phrases',
         return_func=lambda x, y: y.split())
     dictionary = Dictionary(documents)
     dictionary.save(f'{self.board}.dictionary')
     
     return dictionary
Example #13
def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True,
                   filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc is
            a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        if lemmatize is True:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.LEMMA).items())
        else:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.ORTH).items())

        if filter_stops is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums is True:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)

        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)

        gcorpus.append(bow)

    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
Example #14
class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order

        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.clock()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.clock()

                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count*1. / (cur - start)))

                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.clock()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count*1. / (end - start)))

        self.length = t_count
Example #15
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
Example #16
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator(splits_template % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)
    
    return unfiltered_dict
Example #17
class tip_rec:

	def __init__(self, num_topics = 15):
		self.numtopics = num_topics
		self.topic_dict = dict(enumerate(np.zeros(num_topics)))
		self.user_dict = {}
		self.model = None
		self.worddict = {}
		self.mydict = None


	def train(self, df):
		self.user_dict = {el:self.topic_dict.copy() for el in df.sender.unique()}
		cv = CV(stop_words='english')
		X = cv.fit_transform(df['context'])
		vocab = cv.vocabulary_.keys()
		self.worddict=dict([(i, s) for i, s in enumerate(vocab)])
		self.mydict = Dictionary()
		self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
		self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
		for i in df.iterrows():
			if i[1]['context'] == '':
				continue
			else:
				values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
				for val in values:
					if val[0] in self.user_dict[i[1].sender].keys():
						if i[1].amt == '':
							continue
						self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
						continue
					self.user_dict[i[1].sender][val[0]] = val[1]
		for i in self.user_dict.keys():
			norm_const = sum(self.user_dict[i].values())
			for j in self.user_dict[i].keys():
				self.user_dict[i][j] = self.user_dict[i][j]/norm_const

	def predict(self, text, username = ''):
		topics = self.model[self.mydict.doc2bow(text.split())]
		doc_aff = np.zeros(self.numtopics)
		for i in topics:
			doc_aff[i[0]] = i[1]
		if username == '':
			returndict = {}
			for user in self.user_dict.keys():
				user_aff = np.array(self.user_dict[user].values())    
				score = np.linalg.norm(user_aff - doc_aff)
				returndict[user] = score
			return returndict
		else:
			user_aff = np.array(self.user_dict[username].values())    
			score = np.linalg.norm(user_aff - doc_aff)
			return (username, score)
Example #18
 def _load_vocab(self,fname):
     logging.info("loading plain-text file:{}".format(fname))            
     src_file = codecs.open(fname, 'rb', 'utf-8')
     dictionary = Dictionary()
     
     num_instances = 0            
     for term in src_file:            
         dictionary.doc2bow(term.strip().lower().encode('utf-8').split(), allow_update=True)
         num_instances += 1
         
     logging.info("processed {} instances".format(num_instances))
     self.dictionary = dictionary
Example #19
    def __init__(self, fname, dictionary=None):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
        """
        self.fname = fname
        self.metadata = False

        if dictionary is None:
            dictionary = Dictionary()
            for text in self.get_texts():
                dictionary.add_documents([text])
        self.dictionary = dictionary
Example #20
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Example #21
def merge_dictionaries(dictionaries_path, merged_dictionary_path=None):
    dict_paths = list(iglob(dictionaries_path))

    final_dictionary = Dictionary.load(dict_paths[0])

    for dict_path in dict_paths[1:]:
        dictionary = Dictionary.load(dict_path)

        final_dictionary.merge_with(dictionary)

    if merged_dictionary_path:
        final_dictionary.save(merged_dictionary_path)

    return final_dictionary
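A call sketch with hypothetical per-shard dictionaries saved under dicts/ (the directory and file names are made up):

from gensim.corpora.dictionary import Dictionary

Dictionary([['human', 'interface'], ['computer', 'survey']]).save('dicts/shard_0.dict')
Dictionary([['graph', 'minors'], ['trees', 'survey']]).save('dicts/shard_1.dict')

merged = merge_dictionaries('dicts/shard_*.dict', 'dicts/merged.dict')
print(merged)   # one Dictionary covering the union of both vocabularies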
Example #22
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    ''' Provided gensim-dicts `tokendict` and `tagdict`, show the top word frequencies. '''
    if type(tokendict) == str:
        tokendict = Dictionary.load(tokendict)
    if type(tagdict) == str:
        tagdict = Dictionary.load(tagdict)
    
    max_tag_df = max(tagdict.dfs.iteritems(), key=operator.itemgetter(1))
    sorted_dfs = sorted(tokendict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    print "count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1])
    print "----------------------------------------------"
    for tup in sorted_dfs[:100]:
        if tup[1] > max_tag_df[1] * cutoff_factor:
            print "%-15s\t%d" % (tokendict[tup[0]][:15], tup[1])
        else: break
Example #23
    def build_dictionary(self):
        logging.debug('=' * 20)
        logging.debug('First, build the training-corpus dictionary, then map every word to an index and map every sentence to a list of indices')

        # Build the training-corpus dictionary
        # Split every sentence of the training corpus into a list, forming 2-D training documents where each cell is a token,
        # e.g.: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]

        train_document = map(lambda x: x.split(),self.__seg_sentence__)

        gensim_dict = Dictionary.from_documents(train_document)

        # Update the dictionary by adding a special symbol, where
        # UNKOWN stands for unknown characters, i.e. OOV words
        gensim_dict.add_documents([[u'UNKOWN']])
        logging.debug('Updated the dictionary with the special symbol (UNKOWN); dictionary size is now: %d' % (len(gensim_dict.keys())))
        # print 'Updated the dictionary with the special symbol; dictionary size is now: %d' % (len(gensim_dict.keys()))
        self.__gensim_dict__ = gensim_dict
        self.__vocabulary_size__ = len(gensim_dict.keys())
        logging.debug('Training-corpus dictionary size: %d' % (self.__vocabulary_size__))
        print 'Training-corpus dictionary size: %d' % self.__vocabulary_size__

        logging.debug(u'Dictionary contains: %s' % (','.join(gensim_dict.token2id.keys())))
        print u'Dictionary contains: %s' % (','.join(gensim_dict.token2id.keys()))
        # word2embedding = {}
        # unknow_token_index = self.__gensim_dict__.token2id[u'UNKOWN']
        embedding_weights = np.zeros((self.__vocabulary_size__ + 1, self.__word_embedding_length__ ))
        for key,value in gensim_dict.token2id.items():
            embedding_weights[value,:] = self.get_w2vEmbedding(key)
        # TODO: build the word-vector dictionary
        self.__embedding_weights__ = embedding_weights
Example #24
def main(args):
    if args.corpus_type != "wiki":
        if args.processed_corpus_save_path is not None:
            raise ValueError("Processed corpus saving only supported " "for 'wiki' corpus type")

    kwargs = {}
    if args.dictionary_path is not None:
        kwargs["dictionary"] = Dictionary.load(args.dictionary_path)
    if args.dictionary_out_path is not None:
        kwargs["dictionary_save_path"] = args.dictionary_out_path

    if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None:
        kwargs["sentences_save_path"] = args.processed_corpus_save_path

    logging.debug("Building corpus")
    corpus = CORPUS_TYPES[args.corpus_type](args.corpus_path, **kwargs)
    documents = corpus.get_texts()

    logging.debug("Now beginning VSM construction with Word2Vec")

    model = Word2Vec(
        sentences=documents,
        vocab_path=args.vocab_path,
        window=args.window_size,
        drop_capitals=args.drop_capitals,
        min_count=args.minimum_token_count,
        size=args.vector_dimensions,
        workers=multiprocessing.cpu_count(),
    )

    model.save(args.out_path)

    if args.vocab_out_path is not None:
        model.save_vocab(args.vocab_out_path)
Example #25
def user_lda(lda, dictionary_path, text_yielder):
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    for user, text in text_yielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]
    return ret
Example #26
    def __init__(self, corpus_file):
        """
        Args:
            corpus_file -- corpus file; the first column is the category, the rest are tags
        """
        corpus = []
        categories = []
        self._category_distribution = {}  # sample count for each category
        self._words_cate = {}  # category sample counts for each word (tag/feature)
        self._words_sample_count = {}
        self._info_gain = {}
        with open(corpus_file, 'r') as documents:
            for line in documents:
                words = line.strip().split()
                if len(words) <= 1:
                    continue
                categories.append(words[0])
                corpus.append(words[1:])
                if words[0] not in self._category_distribution:
                    self._category_distribution[words[0]] = 0
                self._category_distribution[words[0]] += 1

                # count co-occurrences of words (tags/features) and categories, used to compute conditional entropy
                for word in set(words[1:]):
                    if word not in self._words_cate:
                        self._words_cate[word] = {}
                        self._words_sample_count[word] = 0
                    if words[0] not in self._words_cate[word]:
                        self._words_cate[word][words[0]] = 0
                    self._words_cate[word][words[0]] += 1
                    self._words_sample_count[word] += 1

        self._common_dictionary = Dictionary(corpus)
        self._corpus = corpus
        self._categories = categories
Example #27
File: lda.py  Project: freygit/36
    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually the number of cores minus one
            pretrained_model -- pretrained model; online updates are supported, so a previously trained model can be loaded
            dictionary -- words have to be converted to IDs for training, so the model comes with a matching ID-mapping dictionary
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
Example #28
	def train(self, df):
		self.user_dict = {el:self.topic_dict.copy() for el in df.sender.unique()}
		cv = CV(stop_words='english')
		X = cv.fit_transform(df['context'])
		vocab = cv.vocabulary_.keys()
		self.worddict=dict([(i, s) for i, s in enumerate(vocab)])
		self.mydict = Dictionary()
		self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
		self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
		for i in df.iterrows():
			if i[1]['context'] == '':
				continue
			else:
				values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
				for val in values:
					if val[0] in self.user_dict[i[1].sender].keys():
						if i[1].amt == '':
							continue
						self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
						continue
					self.user_dict[i[1].sender][val[0]] = val[1]
		for i in self.user_dict.keys():
			norm_const = sum(self.user_dict[i].values())
			for j in self.user_dict[i].keys():
				self.user_dict[i][j] = self.user_dict[i][j]/norm_const
Example #29
def plot_dict_hist(gdict):
    ''' Provided gensim-dict `gdict`, plot hist statistics '''
    if type(gdict) == str:
        gdict = Dictionary.load(gdict)
    sorted_dfs = sorted(gdict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    y = [tup[1] for tup in sorted_dfs]
    x = arange(0, len(y))
    
    plt.figure(figsize=(8,5));
    plt.loglog(x, y);
    plt.grid();
    plt.xlabel("Token rank");
    plt.ylabel("Document count");
    
    cdf = np.empty(len(y))
    delta(y, cdf)
    cdf /= np.max(cdf) # normalize
    
    x50 = x[cdf > 0.50][0]
    x80 = x[cdf > 0.80][0]
    x90 = x[cdf > 0.90][0]
    x95 = x[cdf > 0.95][0]
    
    plt.axvline(x50, color='c');
    plt.axvline(x80, color='g');
    plt.axvline(x90, color='r');
    plt.axvline(x95, color='k');
    
    print "50%\t", x50
    print "80%\t", x80
    print "90%\t", x90
    print "95%\t", x95
Example #30
 def __init__(self, input=None,create_dictionary=True):
     super(DefaultJsonCorpus, self).__init__()
     self.input = input
     self.dictionary = Dictionary()
     self.metadata = False
     if create_dictionary:
         self.dictionary.add_documents(self.get_texts())
Example #31
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index for every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector for every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used locally
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
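A call sketch for the function above, assuming gensim 3.x (where Word2Vec still exposes wv.vocab and the size/iter parameters) plus the module-level maxlen and keras sequence import the snippet relies on:

from gensim.models import Word2Vec
from keras.preprocessing import sequence

maxlen = 10   # module-level setting assumed by create_dictionaries

sentences = [['good', 'movie'], ['bad', 'plot', 'but', 'great', 'cast']]
w2v = Word2Vec(sentences, size=50, min_count=1, iter=20)

w2indx, w2vec, padded = create_dictionaries(model=w2v, combined=sentences)
print(padded.shape)   # (2, 10): every sentence padded/truncated to maxlen word indices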
Example #32
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  words with frequency below 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index for every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector for every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used locally
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency below 10 get index 0

        f12.write(str(combined))
        f12.write('\n')

        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #33
def main():
    
    articles_path = '/texts_corrected/*.txt'
    stopword_path = '/stopwords.txt'
    resultspath = '/results/'
    location_path = '/locations.txt'
    tot_topic_vectors_path = resultspath + 'time200msc_topic_vectors_beta0_1.csv'
    tot_topic_mixtures_path = resultspath + 'time200msc_topic_mixtures_beta0_1.csv'
    tot_topic_shapes_path = resultspath + 'time200msc_topic_shapes_beta0_1.csv'
    tot_pickle_path = resultspath + 'time200iter_beta0_1.pickle'
    coherence_pickle_path = resultspath + 'coherence.pickle'
    seed_file = resultspath + '/seedwords.txt'
    
    tot = stot_model()
 
    
    articles,date,vocab = tot.initDataset(articles_path, stopword_path, location_path)
    
    ##save variable for coherence measures
    dictionary = Dictionary(articles)
    corpus = [dictionary.doc2bow(article) for article in articles]
    
    coherence_pickle = open(coherence_pickle_path, 'wb')
    pickle.dump(dictionary, coherence_pickle)
    pickle.dump(corpus, coherence_pickle)
    coherence_pickle.close()
    
    #resume with modelling process
    tot.init_seedwords(seed_file, vocab)  
    param = tot.initParam(articles, date, vocab)
    theta,phi,psi = tot.TopicsOverTimeGibbsSampling(param)
    np.savetxt(tot_topic_vectors_path, phi, delimiter=',')
    np.savetxt(tot_topic_mixtures_path, theta, delimiter=',')
    np.savetxt(tot_topic_shapes_path, psi, delimiter=',')
    tot_pickle = open(tot_pickle_path, 'wb')
    pickle.dump(param, tot_pickle)
    tot_pickle.close()
Example #34
def create_dictionaries(model=None, combined=None):
    ''' This function does three things:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  word frequency below 10 -> 0, hence v -> k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index for every word with frequency over 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector for every word with frequency over 10, (word->model(word))

        def parse_dataset(combined):  # closure, used locally: convert the words in combined to their indices
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # word frequency below 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #35
    def buildDic(self, model=None, words=None):
        '''
        Build the dictionary.
        :param model:   word2vec model
        :param words:   all text content after jieba word segmentation
        :return:        the index of every word (word -> index), the word vectors (word -> vector), and the word indices (positional indices) for each sentence
        '''
        if (model is not None) and (words is not None):
            # initialize a dictionary
            dict = Dictionary()
            # model.wv.vocab.keys() holds all words in the word2vec model; with allow_update=True every occurrence of a word bumps its frequency by one
            # convert to a bag-of-words model
            dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
            # rebuild the mapping: key is the word, value is its index (k is the index, v is the word contained in the dictionary)
            w2indx = {v: k + 1 for k, v in dict.items()}
            # key is the word, value is its word vector
            w2vec = {word: model[word] for word in w2indx.keys()}

            # get the word indices corresponding to a sentence
            def parseDataset(words):
                data = []
                for sentence in words:
                    new_txt = []
                    for word in sentence:
                        try:
                            new_txt.append(w2indx[word])
                        except:
                            new_txt.append(0)
                    data.append(new_txt)
                return data

            combined = parseDataset(words)
            # unify the dimensions of sequences with different lengths
            combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
            return w2indx, w2vec, combined
        else:
            print("模型或数据导入失败")
Example #36
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
        """Initialize the corpus.

        Unless a dictionary is provided, this scans the corpus once,
        to determine its vocabulary.

        Parameters
        ----------
        fname : str
            Path to file with wikipedia dump.
        processes : int, optional
            Number of processes to run, defaults to **number of cpu - 1**.
        lemmatize : bool
            Whether to use lemmatization instead of simple regexp tokenization.
            Defaults to `True` if *pattern* package installed.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Dictionary; if not provided, the corpus is scanned once to determine its vocabulary
            (this can take a **really long time**).
        filter_namespaces : tuple of str
            Namespaces to consider.
        tokenizer_func : function, optional
            Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
            Need to support interface:
            tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
        article_min_tokens : int, optional
            Minimum tokens in article. Article will be ignored if number of tokens is less.
        token_min_len : int, optional
            Minimal token length.
        token_max_len : int, optional
            Maximal token length.
        lower : bool, optional
             If True - convert all text to lower case.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower
        self.dictionary = dictionary or Dictionary(self.get_texts())
Example #37
    def _load(self, tfidf_path, dictionary_path):
        """
        If specified, attempts to load gensim TfidfModel from `tfidf_path`
        and gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        tfidf_path: str
            File-path from which self.tfidf should be loaded.
        dictionary_path: str
            File-path from which self.dictionary should be loaded.
        """
        from gensim.models import TfidfModel
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(tfidf_path):
            raise IOError(
                'The provided file path to the TfidfModel was not found. '
                'Please ensure that the argument is the correct path.')
        if not os.path.exists(dictionary_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        self.tfidf = TfidfModel().load(tfidf_path)
        self.dictionary = Dictionary().load(dictionary_path)
Example #38
def train(docs):
    num_topics = lda_cfg("topics")
    epochs = lda_cfg("epochs")
    label = f'{datetime.now().isoformat(".", timespec="minutes")}({num_topics}-topics,{epochs}-epochs)'

    log_path = config("path.lda-log").format(label)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    logging.basicConfig(filename=log_path,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=lda_cfg("word-extremes.min-count"),
                               no_above=lda_cfg("word-extremes.max-freq"))
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    model = LdaMulticore(corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         passes=epochs,
                         eval_every=lda_cfg.dict_like.get("eval-every"),
                         chunksize=lda_cfg("chunk-size"))

    return label, model, dictionary, corpus
Example #39
 def __init__(self,
              pages_gen,
              processes=None,
              lemmatize=utils.has_pattern(),
              dictionary=None):
     self.pages_gen = pages_gen
     self.metadata = False
     if processes is None:
         processes = max(1, multiprocessing.cpu_count() - 1)
     self.processes = processes
     self.lemmatize = lemmatize
     if dictionary is None:
         self.dictionary = Dictionary(self.get_texts())
     else:
         self.dictionary = dictionary
Example #40
def vect2gensim(vectorizer, dtmatrix):
    # transform sparse matrix into gensim corpus and dictionary
    start = time()
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix,
                                                       documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict(
            (id, word) for word, id in vectorizer.vocabulary_.items()))
    end = time()
    print(
        "Transform vector model to gensim format ... done in {0:0.3f} miliseconds"
        .format((end - start) * 1000))

    return (corpus_vect_gensim, dictionary)
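A usage sketch pairing the converter with scikit-learn's CountVectorizer on a made-up toy corpus:

from sklearn.feature_extraction.text import CountVectorizer

texts = ["human interface computer",
         "survey user computer system response time",
         "graph minors trees"]
vectorizer = CountVectorizer()
dtmatrix = vectorizer.fit_transform(texts)          # sparse documents x terms matrix

corpus_gensim, dictionary = vect2gensim(vectorizer, dtmatrix)
print(len(dictionary))              # number of unique terms
print(next(iter(corpus_gensim)))    # first document as (word id, count) pairs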
Example #41
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
Example #42
def load_data():
    '''this function loads up the already processed data with all of the nested lists properly reformatted as lists, and loads up the dictionaries'''
    df = pd.read_csv('data/processed_full.tsv', sep='\t')
    df['english_tokens'] = df['english_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['french_tokens'] = df['french_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['english_bow'] = df['english_bow'].apply(str_to_int)
    df['french_bow'] = df['french_bow'].apply(str_to_int)
    df['english_padded'] = df['english_padded'].apply(str_to_int)
    df['french_padded'] = df['french_padded'].apply(str_to_int)
    df = df.drop('Unnamed: 0', axis=1)

    eng = Dictionary.load('data/Dictionaries/eng')
    fren = Dictionary.load('data/Dictionaries/fren')

    # create ML data
    X_eng = np.vstack(df['english_padded'].values)
    y_fren = np.vstack(df['french_padded'].values)

    y_fren = y_fren.reshape(*y_fren.shape, 1)
    X_eng = X_eng.reshape(*X_eng.shape, 1)

    return df, eng, fren, X_eng, y_fren
Example #43
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #44
    def create_dictionaries(cls, model=None, combined=None):
        """ Function does are number of Jobs:
            1- Creates a word to index mapping
            2- Creates a word to vector mapping
            3- Transforms the Training and Testing Dictionaries
        """
        def _parse_dataset(sentences):
            """Words become integers
                将每一个句子中的每个词用词向量存在的词的索引表示出来,
                如果词没有在索引中出现,则标为0
            """
            data = []
            for sentence in sentences:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        if combined is not None and model is not None:
            gensim_dict = Dictionary()
            gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
            w2indx = {v: k + 1
                      for k, v in gensim_dict.items()}  # index for every word with frequency over 5
            w2vec = {word: model[word]
                     for word in w2indx.keys()}  # word vector for every word with frequency over 5
            combined = _parse_dataset(combined)
            combined = sequence.pad_sequences(
                combined,
                maxlen=cls.maxlen)  # indices of the words in each sentence; words with frequency below 5 get index 0
            return w2indx, w2vec, combined
        else:
            print('No data provided...')
Example #45
def create_dictionaries(model=None, combined=None):
    # Build the dictionaries: 1- create a word to index mapping 2- create a word to vector mapping 3- transform the training and testing dictionaries

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()}  # word => index
        f = open("../model/word2index.txt", 'w',
                 encoding='utf8')  # this is where the word2index.txt file gets generated
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word] for word in w2indx.keys()}  # word => vector

        def parse_dataset(combined):  # parse the dataset; closure (inner function) used locally
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #46
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        #  words with freq < 10 map to 0, so k+1
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with freq > 10, (k->v)=>(v->k)
        w2vec = {word: model[word] for word in
                 w2indx.keys()}  # word vector of every word with freq > 10, (word->model(word))

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freq < 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined,
                                          maxlen=maxlen)  # index to every word in every sentence, when freq < 10, index = 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #47
def create_dictionaries(model=None, combined=None):
    '''
    Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    :param model:
    :param combined:
    :return:
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(list(model.wv.vocab.keys()), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in list(gensim_dict.items())}  # index for every word with frequency over 10
        w2vec = {word: model[word]
                 for word in list(w2indx.keys())}  # word vector for every word with frequency over 10

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #48
def transform_data(model, x_train, y_train, x_test, y_test):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx.keys()}

    def parse_data(x, y):

        for key in range(len(y)):
            txt = x[key].lower().replace('\n', '').split()
            new_txt = []
            for word in txt:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            x[key] = new_txt
        return x, y

    x_train, y_train = parse_data(x_train, y_train)
    x_test, y_test = parse_data(x_test, y_test)

    return w2indx, w2vec, x_train, y_train, x_test, y_test
Example #49
def get_vocab(tweets=None):
    if 'vocab_sentiment' in os.listdir('.'):
        if not tweets:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
        response = input('Vocabulary found. Do you want to load it? (Y/n)'\
                             ': ')
        if response.lower() in ['n', 'no', 'nah', 'nono', 'nahi', 'nein']:
            if not tweets:
                tweets, labels = export()
                del labels
            return create_vocab(tweets)
        else:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
    else:
        if not tweets:
            tweets, labels = export()
            del labels
        return create_vocab(tweets)
Example #50
def main():
    texts = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees'],
        ['graph', 'minors', 'survey']
    ]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)
    badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)

    goodcm = CoherenceModel(model=goodLdaModel, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
    badcm = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')

    print(goodcm.get_coherence())
    print(badcm.get_coherence())
Example #51
def create_dictionaries(train=None, test=None, model=None):
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example #52
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # the index of a word that has a word vector is never 0
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # integrate all the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # a word without a word vector is indexed 0; return the index of each word
        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in list(sentence):
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # unify the length of the sentence with the pad_sequences function of keras
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # return the word index, the word-vector mapping and the padded, indexed sentences
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
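A hypothetical usage sketch for the function above, assuming a pre-4.0 gensim (where the vocabulary is exposed as model.wv.vocab) and a module-level maxlen plus keras' sequence module already imported:

from gensim.models import Word2Vec

sentences = [['good', 'movie'], ['boring', 'plot']]       # toy tokenized data
w2v = Word2Vec(sentences, size=100, min_count=1)          # `size=` in gensim < 4.0
w2indx, w2vec, padded = create_dictionaries(model=w2v, combined=sentences)
# `padded` is a matrix of word indices, ready for an embedding layer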
Ejemplo n.º 53
0
def pre_processing():
    global vocab, model
    try:
        model = load_model('SentimentAnalysis/model_nn.h5')
    except IOError:
        if 'model_nn.tar.gz' not in os.listdir('SentimentAnalysis'):
            raise IOError("Could not find Sentiment Analysis model. Ensure model "\
                      "is present in: ./SentimentAnalysis")
        else:
            process = subprocess.Popen("cd SentimentAnalysis/; "\
                                   "tar -zxf model_nn.tar.gz; cd ..",
                                   shell=True, stdout=subprocess.PIPE)
            process.wait()
            model = load_model('SentimentAnalysis/model_nn.h5')
    vocab = Dictionary.load('SentimentAnalysis/vocab_sentiment')
Ejemplo n.º 54
0
 def __init__(self, data=None, dictionary=None):
     """ initialize, data should be provided, only when unpickling class object it is not needed!"""
     self.data = data
     self.model = None
     self.num_topics = None
     self.iterations = None
     self.random_state = None
     self.dictionary = dictionary
     if self.data is not None:
         if self.dictionary is None:
             self.dictionary = Dictionary(self.data)
         self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
     else:
         self.dictionary = None
         self.corpus = None
     self.distributed = None
     self.chunksize = None
     self.passes = None
     self.update_every = None
     self.alpha = None
     self.eta = None
     self.decay = None
     self.offset = None
     self.eval_every = None
     self.gamma_threshold = None
     self.minimum_probability = None
     self.ns_conf = None
     self.minimum_phi_value = None
     self.per_word_topics = None
     self.coherence_model = None
     self.coherence = None
     self.coherence_type = None
Ejemplo n.º 55
0
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words whose frequency exceeds 10; (k->v) => (v->k)
        f = open("word2index.txt", 'w', encoding='utf8')
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words whose frequency exceeds 10 (word -> model(word))

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)  # e.g. [[1, 2, 3, ...], []]
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # index sequence per sentence; words with frequency below 10 keep index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Ejemplo n.º 56
0
def get_corpus_and_dict(df, tokens_column, seen=True):
    """
    A Corpus is an iterable collection of Documents that your model is trained on. 
    - e.g. all news articles since 2018
    
    Dictionary is the vocabulary found in your corpus
    - e.g. Merriam Webster's dictionary
    
    We represent these tokens/words in bag-of-words format to optimize processing.
    """
    dictionary = Dictionary(documents=df[tokens_column])
    df['bow'] = df[tokens_column].apply(dictionary.doc2bow)
    corpus = list(df['bow'])

    return df, corpus, dictionary
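A hypothetical usage sketch with a toy DataFrame (the column name is an assumption):

import pandas as pd

toy = pd.DataFrame({'tokens': [['graph', 'trees'], ['graph', 'minors', 'survey']]})
toy, corpus, dictionary = get_corpus_and_dict(toy, 'tokens')
print(dictionary.token2id)   # e.g. {'graph': 0, 'trees': 1, 'minors': 2, 'survey': 3}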
Ejemplo n.º 57
0
class Vocab():
    def __init__(self):
        self.dic = Dictionary()
        self.dic.add_documents([[u'<UNK>']])

    def construct(self, input_file):
        f = codecs.open(input_file, 'r', 'utf-8')
        sentences = []
        for line in f:
            line = line.strip().split()
            sentences.append(line)
        self.dic.add_documents(sentences)
        f.close()
        self.dic.id2token = {v: k for k, v in self.dic.token2id.items()}

    def word2id(self, input_file, output_file):
        f = codecs.open(input_file, 'r', 'utf-8')
        g = open(output_file, 'w')
        for line in f:
            line = line.strip().split()
            # out-of-vocabulary words fall back to the <UNK> id
            line = map(lambda x: str(self.dic.token2id.get(x, self.dic.token2id[u'<UNK>'])), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()

    def id2word(self, input_file, output_file):
        f = open(input_file, 'r')
        g = codecs.open(output_file, 'w', 'utf-8')
        for line in f:
            line = line.strip().split()
            line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line)
            line = u" ".join(line) + u"\n"
            g.write(line)
        f.close()
        g.close()
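A hypothetical usage sketch for the Vocab class (the file names are assumptions):

vocab = Vocab()
vocab.construct('corpus.txt')              # build the dictionary from a tokenized corpus
vocab.word2id('corpus.txt', 'ids.txt')     # encode each line as space-separated token ids
vocab.id2word('ids.txt', 'decoded.txt')    # decode the ids back to words ('#' for unknown ids)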
Ejemplo n.º 58
0
def word2vec_train(tokenizedtalkfile, vocabularyfile):
    wordlist = []
    for line in open(tokenizedtalkfile, 'r'):
        # split on spaces and strip the trailing newline from each token
        talkwords = [word.replace('\n', '') for word in line.split(' ')]
        wordlist.append(talkwords)
    print('Start Training ...')
    start = time.time()
    model = Word2Vec(size=50, min_count=1, window=7, workers=4, sg=1, iter=5)
    model.build_vocab(wordlist)
    model.train(wordlist)
    model.save('corpus_word2vec_model.pkl')
    end = time.time()
    print('Training Time: %.5f' % (end - start))
    model = Word2Vec.load('corpus_word2vec_model.pkl')
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    word2index = {v: k for k, v in gensim_dict.items()}
    with open(vocabularyfile, 'w') as vocabFile:
        for item in word2index.keys():
            vocabFile.write(item + '\t' + str(word2index[item]) + '\n')
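A minimal sketch of reading the vocabulary file back in, assuming the tab-separated "word<TAB>index" format written above:

def load_vocabulary(vocabularyfile):
    # rebuild the word -> index mapping written by word2vec_train
    word2index = {}
    with open(vocabularyfile) as vocab_file:
        for line in vocab_file:
            word, index = line.rstrip('\n').split('\t')
            word2index[word] = int(index)
    return word2index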
Ejemplo n.º 59
0
def further_preprocessing_phase(temp_data_frame):
    temp_data_frame['text'] = temp_data_frame['text'].apply(lambda text: th.tokenize_text(text) if text is not None else '')
    # textlist = temp_data_frame['text'].to_numpy()
    textlist = temp_data_frame['text'].tolist()

    # if this raises an exception, the likely cause is empty texts
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]

    print('original dictionary size: ', len(patent_dictionary))

    # total term frequency of each token id across the corpus
    vocab_tf = {}
    for bow in corpus:
        for token_id, count in dict(bow).items():
            vocab_tf[token_id] = vocab_tf.get(token_id, 0) + int(count)

    # drop tokens whose total frequency is at most 5
    remove_ids = [token_id for token_id, count in vocab_tf.items() if count <= 5]
    patent_dictionary.filter_tokens(bad_ids=remove_ids)

    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)

    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())

    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    temp_data_frame.apply(lambda row : shrink_vocabulary(row, vocabulary, data_frame, ids_list), axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
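A hedged alternative to the manual vocab_tf loop above: recent gensim versions track collection frequencies directly on the dictionary (assuming the installed version exposes Dictionary.cfs), which yields the same totals:

# assumes patent_dictionary.cfs is available (token_id -> total count across all documents)
low_freq_ids = [token_id for token_id, total in patent_dictionary.cfs.items() if total <= 5]
patent_dictionary.filter_tokens(bad_ids=low_freq_ids)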
Ejemplo n.º 60
0
 def score(self, X, y=None, sample_weight=None) -> float:
     # TODO this needs further testing for correctness, WIP
     if self.autoencoder is None:
         raise NotFittedError
     self.autoencoder.eval()
     corpus = Sparse2Corpus(X, documents_columns=False)
     decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
     id2word = {index: str(index) for index in range(X.shape[1])}
     topics = [[str(item.item()) for item in topic]
               for topic in decoder_weight.topk(
                   min(self.score_num, X.shape[1]), dim=0)[1].t()]
     cm = CoherenceModel(topics=topics,
                         corpus=corpus,
                         dictionary=Dictionary.from_corpus(corpus, id2word),
                         coherence='u_mass')
     return cm.get_coherence()