    def querySummary(self, sf, url):
        """
        This will be replaced by a database call to the summary-master table.
        """
        uu = utils()
        try:
            sumbyUrl = sf.groupby('url')
            ssm = sumbyUrl.get_group(url)['summary'].values[0]
            ssm = ''.join(s for s in ssm)
            ssm = genu.any2utf8(ssm)
            return sf, uu.cleanSummary(ssm)

        except KeyError:
            """
            when key error happen we can do as follows:
            either fetch the data at run time and insert in table
            or
            we can for now ignore those urls and send them to a job which
            will take the urls in a que and fetch & insrt their summary in a night job

            """

            print 'url summary not in table create one and insert in table also'
            newSum = ''.join(w for w in self.getSummary(url))
            newSum = genu.any2utf8( newSum)
            newSum =  uu.cleanSummary(newSum)

            #write program to insert into summary data frame
            sf2 = pd.DataFrame([[url,newSum]], columns=['url','summary'])
            sf = sf.append(sf2, ignore_index=True)
            sf.to_pickle(path+'summary.pkl')
            sf = pd.io.pickle.read_pickle(path+'summary.pkl')
            return sf, newSum
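A minimal usage sketch for the method above, under the assumption that `path` points at the folder holding summary.pkl and that the method lives on an object (here called `rec`) exposing getSummary; none of these names are confirmed by the original code:

    # hypothetical caller; 'rec' and the pickle location are assumptions
    sf = pd.io.pickle.read_pickle(path + 'summary.pkl')   # columns: ['url', 'summary']
    sf, summary = rec.querySummary(sf, 'http://example.com/some-article')
    print summary[:200]   # cleaned summary; fetched and appended to the pickle on a KeyError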
Example #2
    def analyze_sentence(self, sentence, threshold, common_terms, scorer, custom_bigrams, ignore_list):
        """Analyze a sentence

        `sentence` a token list representing the sentence to be analyzed.

        `threshold` the minimum score for a bigram to be taken into account.

        `common_terms` the list of common terms, they have a special treatment.

        `scorer` the scorer function, as given to Phrases.

        `custom_bigrams` chains that are always emitted as phrases, regardless of score.

        `ignore_list` tokens whose candidate chains are rejected unless the chain appears in `custom_bigrams`.
        """
        s = [utils.any2utf8(w) for w in sentence]
        last_uncommon = None
        in_between = []
        my_phrases = custom_bigrams

        # adding None is a trick that helps getting an automatic happy ending
        # as it won't be a common_word, nor score
        for word in s + [None]:
            is_common = word in common_terms
            if not is_common and last_uncommon:
                chain = [last_uncommon] + in_between + [word]
                # test between last_uncommon
                score = self.score_item(
                    worda=last_uncommon,
                    wordb=word,
                    components=chain,
                    scorer=scorer,
                )
                if ignore_list and any(x in ignore_list for x in chain):
                    # suppress chains containing ignored tokens unless they are whitelisted in my_phrases
                    if chain not in my_phrases:
                        score = threshold - 1
                if score > threshold or chain in my_phrases:
                    yield (chain, score)
                    last_uncommon = None
                    in_between = []
                else:
                    # release words individually
                    for w in it.chain([last_uncommon], in_between):
                        yield (w, None)
                    in_between = []
                    last_uncommon = word
            elif not is_common:
                last_uncommon = word
            else:  # common term
                if last_uncommon:
                    # wait for uncommon resolution
                    in_between.append(word)
                else:
                    yield (word, None)
Example #3
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in bigram[sentences]:
          ...     print(u' '.join(sentence))
            he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
            nonviolence leo_tolstoy

        """
        warnings.warn(
            "For a faster implementation, use the gensim.models.phrases.Phraser class"
        )

        is_single, sentence = _is_single(sentence)
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter
        min_count = self.min_count
        for word_a, word_b in zip(s, s[1:]):
            if word_a in vocab and word_b in vocab:
                bigram_word = delimiter.join((word_a, word_b))
                if bigram_word in vocab and not last_bigram:
                    pa = float(vocab[word_a])
                    pb = float(vocab[word_b])
                    pab = float(vocab[bigram_word])
                    score = (pab - min_count) / pa / pb * len(vocab)
                    # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                    if score > threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #4
    def learn_vocab(self, sentences):

        self.total_words = 0
        logger.info("collecting all words and their counts")
        self.vocab = defaultdict(basestring)
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types"
                    % (sentence_no, self.total_words, len(self.vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for word_a, word_b in zip(sentence, sentence[1:]):
                word_a = re.sub("[^a-zA-Z]+", "", word_a)
                word_b = re.sub("[^a-zA-Z]+", "", word_b)
                if not word_a: continue
                if self.predict_bigram(word_a, word_b):
                    phrase = word_a + "_" + word_b
                    self.add_vocab(phrase)
                self.add_vocab(word_a)

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                word = re.sub("[^a-zA-Z]+", "", word)
                if not word: continue
                self.add_vocab(word)

        logger.info(
            "collected %i word types from a corpus of %i words ( unigram ) and %i sentences"
            % (len(self.vocab), self.total_words, sentence_no + 1))
Example #5
    def learn_vocab(self, sentences):

        self.total_words = 0
        logger.info("collecting all words and their counts")
        self.vocab = defaultdict(basestring)
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, self.total_words, len(self.vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for word_a, word_b in zip(sentence, sentence[1:]):
                word_a = re.sub("[^a-zA-Z]+", "", word_a)
                word_b = re.sub("[^a-zA-Z]+", "", word_b)
                if not word_a: continue
                if self.predict_bigram(word_a, word_b):
                    phrase = word_a + "_" + word_b
                    self.add_vocab(phrase)
                self.add_vocab(word_a)

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                word = re.sub("[^a-zA-Z]+", "", word)
                if not word: continue
                self.add_vocab(word)

        logger.info("collected %i word types from a corpus of %i words (unigram) and %i sentences" %
                    (len(self.vocab), self.total_words, sentence_no + 1))
Example #6
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_'):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab
Example #7
    def export_phrases(self, sentences):
        """
        Generate an iterator that contains all phrases in given 'sentences'

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for phrase, score in bigram.export_phrases(sentences):
          ...     print(u'{0}\t{1}'.format(phrase, score))

            then you can debug the threshold with generated tsv
        """
        for sentence in sentences:
            s = [utils.any2utf8(w) for w in sentence]
            last_bigram = False
            vocab = self.vocab
            threshold = self.threshold
            delimiter = self.delimiter
            min_count = self.min_count
            for word_a, word_b in zip(s, s[1:]):
                if word_a in vocab and word_b in vocab:
                    bigram_word = delimiter.join((word_a, word_b))
                    if bigram_word in vocab and not last_bigram:
                        pa = float(vocab[word_a])
                        pb = float(vocab[word_b])
                        pab = float(vocab[bigram_word])
                        score = (pab - min_count) / pa / pb * len(vocab)
                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                        if score > threshold:
                            yield (b' '.join((word_a, word_b)), score)
                            last_bigram = True
Example #8
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in bigram[sentences]:
          ...     print(u' '.join(sentence))
            he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
            nonviolence leo_tolstoy

        """
        warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter
        min_count = self.min_count
        for word_a, word_b in zip(s, s[1:]):
            if word_a in vocab and word_b in vocab:
                bigram_word = delimiter.join((word_a, word_b))
                if bigram_word in vocab and not last_bigram:
                    pa = float(vocab[word_a])
                    pb = float(vocab[word_b])
                    pab = float(vocab[bigram_word])
                    score = (pab - min_count) / pa / pb * len(vocab)
                    # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                    if score > threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #9
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab
Example #10
    def __iter__(self):
        with utils.smart_open(self.fname) as fin:
            for sentences in fin:
                a = []
                a.append(sentences[:sentences.find(',')])
                b = sentences[sentences.rfind(', ') + 2: sentences.find(' .')]
                b = [utils.any2utf8(w) for w in b.split(" ")]
                yield (b, a)
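The iterator above expects each line to carry a label before the first comma and the sentence text between the last ', ' and the trailing ' .'; it yields a (token list, [label]) pair per line. A hedged sketch of the assumed input format and of consuming the iterator (the class name LabeledLineCorpus and the file name are placeholders, not part of the original code):

    # one line of the assumed input format:
    #   some_label, optional middle fields, these are the sentence tokens .
    corpus = LabeledLineCorpus('labeled_sentences.txt')   # hypothetical wrapper exposing the __iter__ above
    for tokens, labels in corpus:
        print("%s -> %s" % (labels[0], tokens[:5]))        # label string and first few utf-8 tokens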
Example #11
    def analyze_sentence(self, sentence, threshold, common_terms, scorer):
        """Analyze a sentence, detecting any bigrams that should be concatenated.

        Parameters
        ----------
        sentence : iterable of str
            Token sequence representing the sentence to be analyzed.
        threshold : float
            The minimum score for a bigram to be taken into account.
        common_terms : list of object
            List of common terms, they receive special treatment.
        scorer : function
            Scorer function, as given to :class:`~gensim.models.phrases.Phrases`.
            See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`.

        Yields
        ------
        (str, score)
            If a bigram is detected, a tuple where the first element is the detected bigram and the second is its score.
            Otherwise, the first tuple element is a single word and the second is None.

        """
        s = [utils.any2utf8(w) for w in sentence]
        # adding None is a trick that helps getting an automatic happy ending
        # as it won't be a common_word, nor score
        s.append(None)
        last_uncommon = None
        in_between = []
        for word in s:
            is_common = word in common_terms
            if not is_common and last_uncommon:
                chain = [last_uncommon] + in_between + [word]
                # test between last_uncommon
                score = self.score_item(
                    worda=last_uncommon,
                    wordb=word,
                    components=chain,
                    scorer=scorer,
                )
                if score > threshold:
                    yield (chain, score)
                    last_uncommon = None
                    in_between = []
                else:
                    # release words individually
                    for w in it.chain([last_uncommon], in_between):
                        yield (w, None)
                    in_between = []
                    last_uncommon = word
            elif not is_common:
                last_uncommon = word
            else:  # common term
                if last_uncommon:
                    # wait for uncommon resolution
                    in_between.append(word)
                else:
                    yield (word, None)
Example #12
    def analyze_sentence(self, sentence, threshold, common_terms, scorer):
        """Analyze a sentence, detecting any bigrams that should be concatenated.

        Parameters
        ----------
        sentence : iterable of str
            Token sequence representing the sentence to be analyzed.
        threshold : float
            The minimum score for a bigram to be taken into account.
        common_terms : list of object
            List of common terms, they receive special treatment.
        scorer : function
            Scorer function, as given to :class:`~gensim.models.phrases.Phrases`.
            See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`.

        Yields
        ------
        (str, score)
            If a bigram is detected, a tuple where the first element is the detected bigram and the second is its score.
            Otherwise, the first tuple element is a single word and the second is None.

        """
        s = [utils.any2utf8(w) for w in sentence]
        # adding None is a trick that helps getting an automatic happy ending
        # as it won't be a common_word, nor score
        s.append(None)
        last_uncommon = None
        in_between = []
        for word in s:
            is_common = word in common_terms
            if not is_common and last_uncommon:
                chain = [last_uncommon] + in_between + [word]
                # test between last_uncommon
                score = self.score_item(
                    worda=last_uncommon,
                    wordb=word,
                    components=chain,
                    scorer=scorer,
                )
                if score > threshold:
                    yield (chain, score)
                    last_uncommon = None
                    in_between = []
                else:
                    # release words individually
                    for w in it.chain([last_uncommon], in_between):
                        yield (w, None)
                    in_between = []
                    last_uncommon = word
            elif not is_common:
                last_uncommon = word
            else:  # common term
                if last_uncommon:
                    # wait for uncommon resolution
                    in_between.append(word)
                else:
                    yield (word, None)
Example #13
    def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
        """
        Generate an iterator that contains all phrases in given 'sentences'

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for phrase, score in bigram.export_phrases(sentences):
          ...     print(u'{0}\t{1}'.format(phrase, score))

            then you can debug the threshold with generated tsv
        """

        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter  # delimiter used for lookup
        min_count = self.min_count
        scoring = self.scoring
        corpus_word_count = self.corpus_word_count

        if scoring == 'default':
            scoring_function = \
            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
        elif scoring == 'npmi':
            scoring_function = \
            partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
        # no else here to catch unknown scoring function, check is done in Phrases.__init__

        for sentence in sentences:
            s = [utils.any2utf8(w) for w in sentence]
            last_bigram = False

            for word_a, word_b in zip(s, s[1:]):
                # last bigram check was moved here to save a few CPU cycles
                if word_a in vocab and word_b in vocab and not last_bigram:
                    bigram_word = delimiter.join((word_a, word_b))
                    if bigram_word in vocab:
                        count_a = float(vocab[word_a])
                        count_b = float(vocab[word_b])
                        count_ab = float(vocab[bigram_word])
                        score = scoring_function(count_a, count_b, count_ab)
                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                        # added mincount check because if the scorer doesn't contain min_count
                        # it would not be enforced otherwise
                        if score > threshold and count_ab >= min_count:
                            if as_tuples:
                                yield ((word_a, word_b), score)
                            else:
                                yield (out_delimiter.join(
                                    (word_a, word_b)), score)
                            last_bigram = True
                            continue
                last_bigram = False
Example #14
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in bigram[sentences]:
          ...     print(u' '.join(sentence))
            he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
            nonviolence leo_tolstoy

        """
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        for bigram in zip(s, s[1:]):
            if all(uni in self.vocab for uni in bigram):
                bigram_word = self.delimiter.join(bigram)
                if bigram_word in self.vocab and not last_bigram:
                    pa = float(self.vocab[bigram[0]])
                    pb = float(self.vocab[bigram[1]])
                    pab = float(self.vocab[bigram_word])
                    score = (pab - self.min_count) / pa / pb * len(self.vocab)
                    # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                    if score > self.threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(bigram[0])
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if last_token in self.vocab and not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #15
    def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
        """
        Generate an iterator that contains all phrases in given 'sentences'

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for phrase, score in bigram.export_phrases(sentences):
          ...     print(u'{0}\t{1}'.format(phrase, score))

            then you can debug the threshold with generated tsv
        """

        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter  # delimiter used for lookup
        min_count = self.min_count
        scoring = self.scoring
        corpus_word_count = self.corpus_word_count

        if scoring == 'default':
            scoring_function = \
            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
        elif scoring == 'npmi':
            scoring_function = \
            partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
        # no else here to catch unknown scoring function, check is done in Phrases.__init__

        for sentence in sentences:
            s = [utils.any2utf8(w) for w in sentence]
            last_bigram = False

            for word_a, word_b in zip(s, s[1:]):
                # last bigram check was moved here to save a few CPU cycles
                if word_a in vocab and word_b in vocab and not last_bigram:
                    bigram_word = delimiter.join((word_a, word_b))
                    if bigram_word in vocab:
                        count_a = float(vocab[word_a])
                        count_b = float(vocab[word_b])
                        count_ab = float(vocab[bigram_word])
                        score = scoring_function(count_a, count_b, count_ab)
                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                        # added mincount check because if the scorer doesn't contain min_count
                        # it would not be enforced otherwise
                        if score > threshold and count_ab >= min_count:
                            if as_tuples:
                                yield ((word_a, word_b), score)
                            else:
                                yield (out_delimiter.join((word_a, word_b)), score)
                            last_bigram = True
                            continue
                last_bigram = False
Example #16
    def test_cython_linesentence_readline_after_getting_offsets(self):
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        from gensim.models.word2vec_corpusfile import CythonLineSentence

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        for offset, line in zip(offsets, lines):
            ls = CythonLineSentence(tmpf, offset)
            sentence = ls.read_sentence()
            self.assertEqual(len(sentence), 1)
            self.assertEqual(sentence[0], utils.any2utf8(line.strip()))
Example #17
    def analyze_sentence(self, sentence, threshold, common_terms, scorer):
        """Analyze a sentence

        `sentence` a token list representing the sentence to be analyzed.

        `threshold` the minimum score for a bigram to be taken into account

        `common_terms` the list of common terms, they have a special treatment

        `scorer` the scorer function, as given to Phrases
        """
        s = [utils.any2utf8(w) for w in sentence]
        last_uncommon = None
        in_between = []
        # adding None is a trick that helps getting an automatic happy ending
        # as it won't be a common_word, nor score
        for word in s + [None]:
            is_common = word in common_terms
            if not is_common and last_uncommon:
                chain = [last_uncommon] + in_between + [word]
                # test between last_uncommon
                score = self.score_item(
                    worda=last_uncommon,
                    wordb=word,
                    components=chain,
                    scorer=scorer,
                )
                if score > threshold:
                    yield (chain, score)
                    last_uncommon = None
                    in_between = []
                else:
                    # release words individually
                    for w in it.chain([last_uncommon], in_between):
                        yield (w, None)
                    in_between = []
                    last_uncommon = word
            elif not is_common:
                last_uncommon = word
            else:  # common term
                if last_uncommon:
                    # wait for uncommon resolution
                    in_between.append(word)
                else:
                    yield (word, None)
Example #18
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_',
        or another configured delimiter character).

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        """
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        phrasegrams = self.phrasegrams
        delimiter = self.delimiter
        for word_a, word_b in zip(s, s[1:]):
            bigram_tuple = (word_a, word_b)
            if phrasegrams.get(
                    bigram_tuple,
                (-1, -1))[1] > self.threshold and not last_bigram:
                bigram_word = delimiter.join((word_a, word_b))
                new_s.append(bigram_word)
                last_bigram = True
                continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #19
    def prep_text(self, p_num, sentences, outfile):
        output = open(outfile, 'w')
        #distribute textfiles
        for i, sen in enumerate(sentences):
            sentence = [utils.any2utf8(w) for w in sen]
            for word_a, word_b in zip(sentence, sentence[1:]):
                word_a = re.sub("[^a-zA-Z]+", "", word_a)
                word_b = re.sub("[^a-zA-Z]+", "", word_b)
                if not word_a: continue
                phrase = word_a + "_" + word_b
                if phrase in self.vocab:
                    output.write(utils.to_utf8(self.vocab[phrase] + ' '))
                else:
                    output.write(utils.to_utf8(self.vocab[word_a] + ' '))

            if i % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i " % (i))
        logger.info("PROGRESS: at sentence #%i " % (i))
        output.close()
Example #20
    def prep_text(self, p_num, sentences, outfile):
        output = open(outfile, 'w')
        # distribute textfiles
        for i, sen in enumerate(sentences):
            sentence = [utils.any2utf8(w) for w in sen]
            for word_a, word_b in zip(sentence, sentence[1:]):
                word_a = re.sub("[^a-zA-Z]+", "", word_a)
                word_b = re.sub("[^a-zA-Z]+", "", word_b)
                if not word_a: continue
                phrase = word_a + "_" + word_b
                if phrase in self.vocab:
                    output.write(utils.to_utf8(self.vocab[phrase] + ' '))
                else:
                    output.write(utils.to_utf8(self.vocab[word_a] + ' '))

            if i % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i " % (i))
        logger.info("PROGRESS: at sentence #%i " % (i))
        output.close()
Example #21
    def learn_vocab(sentences,
                    max_vocab_size,
                    delimiter=b'_',
                    progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no,
                    total_words,
                    len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between,
                                              [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1)
        return min_reduce, vocab, total_words
Example #22
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_',
        or another configured delimiter character).

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        """
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        phrasegrams = self.phrasegrams
        delimiter = self.delimiter
        for word_a, word_b in zip(s, s[1:]):
            bigram_tuple = (word_a, word_b)
            if phrasegrams.get(bigram_tuple, (-1, -1))[1] > self.threshold and not last_bigram:
                bigram_word = delimiter.join((word_a, word_b))
                new_s.append(bigram_word)
                last_bigram = True
                continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #23
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no, total_words, len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between, [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1
        )
        return min_reduce, vocab, total_words
Example #24
    def insertSummary(self, urls):
        old = pd.io.pickle.read_pickle(path+'summary.pkl')
        sumByUrl = old.groupby('url')
        uu = utils()

        sumList = []
        for url in urls:
            try:
                ssm = sumByUrl.get_group(url)['summary'].values[0]
                sumList.append([url,ssm])
            except KeyError:
                newSum = ''.join(w for w in self.getSummary(url))
                newSum = genu.any2utf8(newSum)
                newSum = uu.cleanSummary(newSum)
                if len(newSum) < 100:
                    print 'dropping ' + newSum + ' of url ' + url
                else:
                    print url + ' summary not in table; creating one and inserting it into the table as well'
                    sumList.append([url, newSum])
        if len(sumList) > 0:
            sf = pd.DataFrame(sumList, columns=['url','summary'])
            sumDf = old.append(sf, ignore_index=True)
            sumDf.to_pickle(path+'summary.pkl')
        return sumList
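A short, hedged usage sketch for insertSummary; the object name `rec` and the URLs below are placeholders:

    # rows already present in summary.pkl are reused; missing summaries are
    # fetched, cleaned, and (if long enough) appended to the pickle
    rows = rec.insertSummary(['http://example.com/a', 'http://example.com/b'])
    for url, summary in rows:
        print url, len(summary)   # each element of the returned list is [url, summary]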
Example #25
def get_custom_stop_words():
    spacy_stop_words = set(nlp.Defaults.stop_words)
    custom_stop_words = set(
        ['a', 'agora', 'ainda', 'alem', 'algum', 'alguma', 'algumas', 'alguns', 'alguém', 'além', 'ambas', 'ambos',
         'ampla', \
         'amplas', 'amplo', 'amplos', 'and', 'ante', 'antes', 'ao', 'aonde', 'aos', 'apos', 'após', 'aquela',
         'aquelas', \
         'aquele', 'aqueles', 'aquilo', 'as', 'assim', 'através', 'até', 'cada', 'coisa', 'coisas', 'com', 'como',
         'contra', \
         'contudo', 'cuja', 'cujas', 'cujo', 'cujos', 'côm', 'da', 'daquele', 'daqueles', 'das', 'data', 'de',
         'dela', 'delas', \
         'dele', 'deles', 'demais', 'depois', 'desde', 'dessa', 'dessas', 'desse', 'desses', 'desta', 'destas',
         'deste', \
         'destes', 'deve', 'devem', 'devendo', 'dever', 'deveria', 'deveriam', 'deverá', 'deverão', 'devia',
         'deviam', \
         'dispoe', 'dispoem', 'dispõe', 'dispõem', 'disse', 'disso', 'disto', 'dito', 'diversa', 'diversas',
         'diversos', 'diz', \
         'dizem', 'do', 'dos', 'durante', 'dà', 'dàs', 'dá', 'dás', 'dê', 'e', 'ela', 'elas', 'ele', 'eles', 'em',
         'enquanto', \
         'entao', 'entre', 'então', 'era', 'eram', 'essa', 'essas', 'esse', 'esses', 'esta', 'estamos', 'estas',
         'estava', \
         'estavam', 'este', 'esteja', 'estejam', 'estejamos', 'estes', 'esteve', 'estive', 'estivemos', 'estiver',
         'estivera', \
         'estiveram', 'estiverem', 'estivermos', 'estivesse', 'estivessem', 'estivéramos', 'estivéssemos', 'estou',
         'està', \
         'estàs', 'está', 'estás', 'estávamos', 'estão', 'eu', 'fazendo', 'fazer', 'feita', 'feitas', 'feito',
         'feitos', 'foi', \
         'fomos', 'for', 'fora', 'foram', 'forem', 'formos', 'fosse', 'fossem', 'fui', 'fôramos', 'fôssemos',
         'grande', \
         'grandes', 'ha', 'haja', 'hajam', 'hajamos', 'havemos', 'havia', 'hei', 'houve', 'houvemos', 'houver',
         'houvera', \
         'houveram', 'houverei', 'houverem', 'houveremos', 'houveria', 'houveriam', 'houvermos', 'houverá',
         'houverão', \
         'houveríamos', 'houvesse', 'houvessem', 'houvéramos', 'houvéssemos', 'há', 'hão', 'isso', 'isto', 'já',
         'la', 'lhe', \
         'lhes', 'lo', 'logo', 'lá', 'mais', 'mas', 'me', 'mediante', 'menos', 'mesma', 'mesmas', 'mesmo', 'mesmos',
         'meu', 'meus', \
         'minha', 'minhas', 'muita', 'muitas', 'muito', 'muitos', 'nº', 'na', 'nas', 'nem', 'nenhum', 'nessa',
         'nessas',
         'nesse', \
         'nesta', 'nestas', 'neste', 'ninguém', 'no', 'nos', 'nossa', 'nossas', 'nosso', 'nossos', 'num', 'numa',
         'nunca', 'ná', \
         'nás', 'não', 'nós', 'o', 'or', 'os', 'ou', 'outra', 'outras', 'outro', 'outros', 'para', 'pela', 'pelas',
         'pelo', 'pelos', \
         'pequena', 'pequenas', 'pequeno', 'pequenos', 'per', 'perante', 'pode', 'podendo', 'poder', 'poderia',
         'poderiam', \
         'podia', 'podiam', 'pois', 'por', 'porque', 'porquê', 'portanto', 'porém', 'posso', 'pouca', 'poucas',
         'pouco', 'poucos', \
         'primeiro', 'primeiros', 'proprio', 'própria', 'próprias', 'próprio', 'próprios', 'pôde', 'quais', 'qual',
         'qualquer', \
         'quando', 'quanto', 'quantos', 'quaís', 'que', 'quem', 'quer', 'quê', 'se', 'seja', 'sejam', 'sejamos',
         'sem', 'sempre', \
         'sendo', 'ser', 'serei', 'seremos', 'seria', 'seriam', 'será', 'serão', 'seríamos', 'seu', 'seus', 'si',
         'sido', 'sob', \
         'sobre', 'somos', 'sou', 'sua', 'suas', 'são', 'só', 'tal', 'talvez', 'tambem', 'também', 'tampouco', 'te',
         'tem', 'temos', \
         'tendo', 'tenha', 'tenham', 'tenhamos', 'tenho', 'ter', 'terei', 'teremos', 'teria', 'teriam', 'terá',
         'terão', 'teríamos', \
         'teu', 'teus', 'teve', 'ti', 'tido', 'tinha', 'tinham', 'tive', 'tivemos', 'tiver', 'tivera', 'tiveram',
         'tiverem', \
         'tivermos', 'tivesse', 'tivessem', 'tivéramos', 'tivéssemos', 'toda', 'todas', 'todavia', 'todo', 'todos',
         'tu', 'tua', \
         'tuas', 'tudo', 'tém', 'têm', 'tínhamos', 'um', 'uma', 'umas', 'uns', 'vendo', 'ver', 'vez', 'vindo', 'vir',
         'você', \
         'vocês', 'vos', 'vós', 'à', 'às', 'á', 'ás', 'ão', 'è', 'é', 'éramos', 'êm', 'ò', 'ó', 'õ', 'última',
         'últimas', 'último', \
         'últimos'])
    custom_stop_words.update(spacy_stop_words)
    return [any2utf8(stop_word) for stop_word in custom_stop_words]
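One plausible use of this list, sketched here and not taken from the original code, is to pass it as the `common_terms` argument of gensim's Phrases so that Portuguese stop words do not break up multi-word expressions:

    from gensim.models.phrases import Phrases

    stop_words = get_custom_stop_words()     # utf-8 byte strings, as returned above
    # `sentences` is assumed to be an iterable of token lists
    bigram = Phrases(sentences, min_count=5, threshold=10.0, common_terms=stop_words)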
Example #26
    def __init__(self,
                 sentences=None,
                 min_count=5,
                 threshold=10.0,
                 max_vocab_size=40000000,
                 delimiter=b'_',
                 progress_per=10000,
                 scoring='default',
                 common_terms=frozenset(),
                 doc2vec=False):
        """
        Parameters
        ----------
        sentences : iterable of list of str, optional
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        min_count : float, optional
            Ignore all words and bigrams with total collected count lower than this value.
        threshold : float, optional
            Represent a score threshold for forming the phrases (higher means fewer phrases).
            A phrase of words `a` followed by `b` is accepted if the score of the phrase is greater than threshold.
            Heavily depends on concrete scoring-function, see the `scoring` parameter.
        max_vocab_size : int, optional
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
            Two built-in scoring functions are available by setting `scoring` to a string:
            #. "default" - :func:`~gensim.models.phrases.original_scorer`.
            #. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
        common_terms : set of str, optional
            List of "stop words" that won't affect frequency count of expressions containing them.
            Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".
        Notes
        -----
        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring
        as described by `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
        <https://arxiv.org/abs/1310.4546>`_.
        To use a custom scoring function, pass in a function with the following signature:
        * worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored
        * wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored
        * bigram_count - number of occurrences in `sentences` of the whole bigram
        * len_vocab - the number of unique tokens in `sentences`
        * min_count - the `min_count` setting of the Phrases class
        * corpus_word_count - the total number of tokens (non-unique) in `sentences`
        The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
        The scoring function **must be pickleable**.
        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError(
                "threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError(
                "threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' %
                                 (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab',
            'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0]
                   for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError(
                    'scoring function missing expected parameters')

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)
        self.doc2vec = doc2vec

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError(
                'unable to pickle custom Phrases scoring function')
        finally:
            del (test_pickle)
            del (load_pickle)

        if sentences is not None:
            self.add_vocab(sentences)
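To make the custom-scorer contract from the docstring above concrete, here is a hedged sketch of a pickleable scoring function exposing all six required parameter names; it simply mirrors the PMI-like default scorer, and the function name is an assumption:

    def my_scorer(worda_count, wordb_count, bigram_count,
                  len_vocab, min_count, corpus_word_count):
        # must accept every parameter even if unused (corpus_word_count here);
        # defined at module level so it stays pickleable
        return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

    # `sentences` is an iterable of token lists, as elsewhere in these examples
    phrases = Phrases(sentences, min_count=5, threshold=10.0, scoring=my_scorer)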
Example #27
    def learn_vocab(sentences,
                    max_vocab_size,
                    delimiter=b'_',
                    progress_per=10000,
                    common_terms=frozenset(),
                    doc2vec=False):
        """Collect unigram/bigram counts from the `sentences` iterable.
        Parameters
        ----------
        sentences : iterable of list of str
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        max_vocab_size : int
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int
            Write logs every `progress_per` sentence.
        common_terms : set of str, optional
            List of "stop words" that won't affect frequency count of expressions containing them.
            Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".
        Returns
        -------
        (int, dict of (str, int), int)
            Number of pruned words, counters for each word/bi-gram and total number of words.

        Example
        -------
        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases
        >>>
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
        >>> (pruned_words, total_words)
        (1, 29)
        >>> counters['computer']
        2
        >>> counters['response_time']
        1
        """
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no,
                    total_words,
                    len(vocab),
                )

            if doc2vec:
                sentence = sentence.words

            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between,
                                              [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1)
        return min_reduce, vocab, total_words
Example #28
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
                 scoring='default', common_terms=frozenset()):
        """

        Parameters
        ----------
        sentences : iterable of list of str, optional
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        min_count : float, optional
            Ignore all words and bigrams with total collected count lower than this value.
        threshold : float, optional
            Represent a score threshold for forming the phrases (higher means fewer phrases).
            A phrase of words `a` followed by `b` is accepted if the score of the phrase is greater than threshold.
            Heavily depends on concrete scoring-function, see the `scoring` parameter.
        max_vocab_size : int, optional
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
            Two built-in scoring functions are available by setting `scoring` to a string:

            #. "default" - :func:`~gensim.models.phrases.original_scorer`.
            #. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
        common_terms : set of str, optional
            List of "stop words" that won't affect frequency count of expressions containing them.
            Allows detecting expressions like "bank_of_america" or "eye_of_the_beholder".

        Notes
        -----
        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring
            as described by `Mikolov, et al.: "Distributed Representations of Words and Phrases and their Compositionality"
        <https://arxiv.org/abs/1310.4546>`_.

        To use a custom scoring function, pass in a function with the following signature:

        * worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored
        * wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored
        * bigram_count - number of occurrences in `sentences` of the whole bigram
        * len_vocab - the number of unique tokens in `sentences`
        * min_count - the `min_count` setting of the Phrases class
        * corpus_word_count - the total number of tokens (non-unique) in `sentences`

        The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
        The scoring function **must be pickleable**.

        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError("threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError("threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' % (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError('scoring function missing expected parameters')

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError('unable to pickle custom Phrases scoring function')
        finally:
            del(test_pickle)
            del(load_pickle)

        if sentences is not None:
            self.add_vocab(sentences)
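
The docstring above lists the six parameter names a custom scorer must accept and notes that it has to be picklable. A minimal sketch of plugging one in; the name frequency_ratio_scorer and its toy formula are assumptions made for illustration, not gensim functions:

from gensim.models.phrases import Phrases

def frequency_ratio_scorer(worda_count, wordb_count, bigram_count,
                           len_vocab, min_count, corpus_word_count):
    # all six parameters must appear in the signature, even the unused ones,
    # and the function should live at module level so it stays picklable
    return float(bigram_count) / max(min(worda_count, wordb_count), 1)

phrases = Phrases(min_count=5, threshold=0.5, scoring=frequency_ratio_scorer)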
Example #29
        # flags initialised up front so the check below cannot raise a NameError
        utf8 = True
        utf16 = False
        try:
            lines = codecs.open(dir + "/" + file, "r", "utf-8").readlines()
            print "utf8"
        except:
            utf8 = False
        if not utf8:
            print "utf16"
            try:
                lines = codecs.open(dir + "/" + file, "r",
                                    "utf-16").readlines()
            except:
                utf8 = False
                utf16 = True

        for line in lines:
            # `cor` is assumed to be the corpus output file opened earlier in the original script
            cor.write(utils.any2utf8(line))
cor.close()
dictionary = corpora.Dictionary(
    utils.any2utf8(line.lower()).split()
    for line in codecs.open("testdata/corpora/chinese/BAXCN_00007421.txt", "r",
                            'utf-16').readlines())
print "utf8 dictionary"
print dictionary.values()
dictionary.save("test.dict")
dictionary.save_as_text('dict.txt')
mydict = dictionary.load("test.dict")
print mydict
print dictionary
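
As a usage note (not part of the original snippet): the plain-text dump written by save_as_text() above can be read back with Dictionary.load_from_text, which is convenient for inspecting the id/token/frequency triples without unpickling test.dict:

from gensim import corpora

text_dict = corpora.Dictionary.load_from_text('dict.txt')
print(text_dict)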
Example #30
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable.

        Parameters
        ----------
        sentences : iterable of list of str
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        max_vocab_size : int
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int
            Write logs every `progress_per` sentence.
        common_terms : set of str, optional
            List of "stop words" that won't affect frequency count of expressions containing them.
            Allows detecting expressions like "bank_of_america" or "eye_of_the_beholder".

        Returns
        -------
        (int, dict of (str, int), int)
            Number of pruned words, counters for each word/bigram, and the total number of words.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases
            >>>
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
            >>> (pruned_words, total_words)
            (1, 29)
            >>> counters['computer']
            2
            >>> counters['response_time']
            1

        """
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no, total_words, len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between, [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1
        )
        return min_reduce, vocab, total_words
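
A minimal sketch of the common_terms behaviour described above, run through learn_vocab on a toy corpus; the sentences and the printed counts are made up for illustration. With 'of' listed as a common term, the span "bank of america" is tallied under one joined key while 'of' itself never gets its own entry:

from gensim.models.phrases import Phrases

toy_sentences = [
    [u'bank', u'of', u'america', u'offers', u'loans'],
    [u'bank', u'of', u'america', u'bank'],
]
# when calling learn_vocab directly, common terms must already be utf8 byte strings;
# the Phrases constructor normally does this conversion for you
min_reduce, vocab, total_words = Phrases.learn_vocab(
    toy_sentences, 100, delimiter=b'_', common_terms=frozenset([b'of']))
print(vocab[b'bank_of_america'])  # 2
print(vocab[b'bank'])             # 3
print(b'of' in vocab)             # False -- common terms are not counted on their own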
Example #31
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
                 scoring='default', common_terms=frozenset(), custom_bigrams=None, ignore_list=None):
        """
        Initialize the model from an iterable of `sentences`. Each sentence must be
        a list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        without storing everything in RAM. See :class:`BrownCorpus`,
        :class:`Text8Corpus` or :class:`LineSentence` in the :mod:`gensim.models.word2vec`
        module for such examples.

        `min_count` ignore all words and bigrams with total collected count lower
        than this.

        `threshold` represents a score threshold for forming the phrases (higher means
        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
        phrase is greater than threshold; see the `scoring` setting.

        `max_vocab_size` is the maximum size of the vocabulary. Used to control
        pruning of less common words, to keep memory under control. The default
        of 40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
        on how much available memory you have.

        `delimiter` is the glue character used to join collocation tokens, and
        should be a byte string (e.g. b'_').

        `scoring` specifies how potential phrases are scored for comparison to the `threshold`
        setting. `scoring` can be set with either a string that refers to a built-in scoring function,
        or with a function with the expected parameter names. Two built-in scoring functions are available
        by setting `scoring` to a string:

        'default': from "Efficient Estimation of Word Representations in Vector Space" by
                   Mikolov, et al.:
                   (count(worda followed by wordb) - min_count) * N /
                   (count(worda) * count(wordb)) > threshold, where `N` is the total vocabulary size.
        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
                Information in Collocation Extraction" by Gerlof Bouma:
                ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
                -ln(prop(worda followed by wordb)),
                where prop(n) is the count of n / the count of everything in the entire corpus.

        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default.

        To use a custom scoring function, create a function with the following parameters and set the `scoring`
        parameter to the custom function. The function must accept all of these parameters, even if it does not
        use every one of them in its scoring.

            worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
            wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
            bigram_count: number of occurrences in `sentences` of the phrase being scored
            len_vocab: the number of unique tokens in `sentences`
            min_count: the `min_count` setting of the Phrases class
            corpus_word_count: the total number of (non-unique) tokens in `sentences`

        A scoring function that is missing any of these parameters (even if it does not use them) will cause a
        ValueError to be raised when the Phrases class is initialized. The scoring function must be picklable.

        `common_terms` is an optional list of "stop words" that won't affect frequency count
        of expressions containing them.
        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError("threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError("threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' % (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError('scoring function missing expected parameters')

        self.min_count = min_count
        self.ignore_list = ignore_list
        self.custom_bigrams = custom_bigrams
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError('unable to pickle custom Phrases scoring function')
        finally:
            del(test_pickle)
            del(load_pickle)

        if sentences is not None:
            self.add_vocab(sentences)
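
The two built-in scoring formulas quoted in the docstring, written out as plain functions with the full required parameter list. This is a sketch that mirrors the formulas as quoted above, not a copy of gensim's own original_scorer / npmi_scorer:

from math import log

def default_score(worda_count, wordb_count, bigram_count,
                  len_vocab, min_count, corpus_word_count):
    # (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb))
    return (bigram_count - min_count) / float(worda_count) / wordb_count * len_vocab

def npmi_score(worda_count, wordb_count, bigram_count,
               len_vocab, min_count, corpus_word_count):
    # ln(prop(a followed by b) / (prop(a) * prop(b))) / -ln(prop(a followed by b))
    pa = worda_count / float(corpus_word_count)
    pb = wordb_count / float(corpus_word_count)
    pab = bigram_count / float(corpus_word_count)
    return log(pab / (pa * pb)) / -log(pab)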
Example #32
import sys
from gensim.utils import any2utf8
import json
from pprint import pprint
import collections
import unicodedata
from collections import OrderedDict
# assumed import: muterun_js (used below to run parser.js) comes from the Naked toolshed
from Naked.toolshed.shell import muterun_js

f = open('output.txt', 'a+')

wiki_list = ["pakistan","france"]

for value in wiki_list:
    response = muterun_js('parser.js',value)


    if response.exitcode == 0: 
        result = response.stdout
        result = result.replace("{","").replace('"','').replace("}","").replace("\n","")
        print result
        #result = unicodedata.normalize('NFKD', unicode(result, "utf-8")).encode('ascii','ignore')
        #test = json.loads(response.stdout, object_pairs_hook= collections.OrderedDict)
        f.write(any2utf8(result))
        f.write("\n")
        #for key,value in test.iteritems():
        #    print key," : ",value
    
f.close()

Example #33
import sys
import os
from gensim import corpora, models, utils

# `dictionary` is assumed to have been loaded earlier in the original script
# (only the TF-IDF part is shown here).
tfidffile = sys.argv[2]
if tfidffile.endswith("tfidf.mm"):
	print "got corpus matrix, creating new tfidf model"
	corpus=corpora.MmCorpus(tfidffile)
	tfidf=models.TfidfModel(corpus,id2word=dictionary,normalize=True)
	tfidf.save(tfidffile.replace("tfidf.mm","tfidf"))
	print "saved tfidf model at "+ tfidffile.replace("tfidf.mm","tfidf")
else:
	print "using tfidf model at " + tfidffile
	tfidf=models.TfidfModel.load(tfidffile)
docs=[]
for dir,dirs,filenames in os.walk(sys.argv[3]):
	
	if not "/." in dir:
		for file in filenames:		
			path=dir+"/"+file
			if not path.startswith("."):
				doctfidf=tfidf[dictionary.doc2bow(utils.tokenize(utils.any2utf8(open(path,"r").read(),errors='ignore')))]
				doctfidf.sort(key=lambda tup:tup[1],reverse=True)
				docs.append((file,doctfidf))
tfidflog=open("tfidf.txt","w")
for file,doc in docs:
	for word in doc:
		tfidflog.write(file+","+tfidf.id2word[word[0]]+","+str(word[1])+"\n")	
tfidflog.close()
print "wrote results to tfidf.txt"	
#for term in tfidf.idfs:
#	print tfidf.id2word[term]+","+str(tfidf.idfs[term])
#	print tfidf[[(term,1)]]

Example #34
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
                 scoring='default', common_terms=frozenset()):
        """
        Initialize the model from an iterable of `sentences`. Each sentence must be
        a list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        without storing everything in RAM. See :class:`BrownCorpus`,
        :class:`Text8Corpus` or :class:`LineSentence` in the :mod:`gensim.models.word2vec`
        module for such examples.

        `min_count` ignore all words and bigrams with total collected count lower
        than this.

        `threshold` represents a score threshold for forming the phrases (higher means
        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
        phrase is greater than threshold; see the `scoring` setting.

        `max_vocab_size` is the maximum size of the vocabulary. Used to control
        pruning of less common words, to keep memory under control. The default
        of 40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
        on how much available memory you have.

        `delimiter` is the glue character used to join collocation tokens, and
        should be a byte string (e.g. b'_').

        `scoring` specifies how potential phrases are scored for comparison to the `threshold`
        setting. `scoring` can be set with either a string that refers to a built-in scoring function,
        or with a function with the expected parameter names. Two built-in scoring functions are available
        by setting `scoring` to a string:

        'default': from "Efficient Estimation of Word Representations in Vector Space" by
                   Mikolov, et al.:
                   (count(worda followed by wordb) - min_count) * N /
                   (count(worda) * count(wordb)) > threshold, where `N` is the total vocabulary size.
        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
                Information in Collocation Extraction" by Gerlof Bouma:
                ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
                -ln(prop(worda followed by wordb)),
                where prop(n) is the count of n / the count of everything in the entire corpus.

        'npmi' is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default.

        To use a custom scoring function, create a function with the following parameters and set the `scoring`
        parameter to the custom function. The function must accept all of these parameters, even if it does not
        use every one of them in its scoring.

            worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
            wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
            bigram_count: number of occurrences in `sentences` of the phrase being scored
            len_vocab: the number of unique tokens in `sentences`
            min_count: the `min_count` setting of the Phrases class
            corpus_word_count: the total number of (non-unique) tokens in `sentences`

        A scoring function that is missing any of these parameters (even if it does not use them) will cause a
        ValueError to be raised when the Phrases class is initialized. The scoring function must be picklable.

        `common_terms` is an optional list of "stop words" that won't affect frequency count
        of expressions containing them.
        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0 and scoring == 'default':
            raise ValueError("threshold should be positive for default scoring")
        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
            raise ValueError("threshold should be between -1 and 1 for npmi scoring")

        # set scoring based on string
        # intentionally override the value of the scoring parameter rather than set self.scoring here,
        # to still run the check of scoring function parameters in the next code block

        if isinstance(scoring, six.string_types):
            if scoring == 'default':
                scoring = original_scorer
            elif scoring == 'npmi':
                scoring = npmi_scorer
            else:
                raise ValueError('unknown scoring method string %s specified' % (scoring))

        scoring_parameters = [
            'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'
        ]
        if callable(scoring):
            if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
                self.scoring = scoring
            else:
                raise ValueError('scoring function missing expected parameters')

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.corpus_word_count = 0
        self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms)

        # ensure picklability of custom scorer
        try:
            test_pickle = pickle.dumps(self.scoring)
            load_pickle = pickle.loads(test_pickle)
        except pickle.PickleError:
            raise pickle.PickleError('unable to pickle custom Phrases scoring function')
        finally:
            del(test_pickle)
            del(load_pickle)

        if sentences is not None:
            self.add_vocab(sentences)
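
As a closing usage note (assumed, not part of the original example): training Phrases with common_terms lets stop words sit inside a collocation without breaking it up, as in the "bank_of_america" case the docstring mentions. The toy corpus and the deliberately low threshold are illustrative choices only:

from gensim.models.phrases import Phrases

sentences = [
    [u'bank', u'of', u'america', u'reported', u'earnings'],
    [u'bank', u'of', u'america', u'shares', u'rose'],
] * 10  # repeat so the pair clears min_count and the low threshold

phrases = Phrases(sentences, min_count=1, threshold=0.1,
                  common_terms=frozenset([u'of', u'the']))
print(phrases[[u'bank', u'of', u'america', u'shares', u'rose']])
# the output is expected to contain the joined token u'bank_of_america'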