Esempio n. 1
0
    def extract_idea(self, idea):
        """
        Given a proposed idea, find it in the corpus, add it to the list of
        discovered ideas, mark all positions in which it appears,
        and save its ngrams.

        The method first walks every document, aligning the idea against each
        candidate window and computing a unigram background score for the
        aligned span.  Those scores are not used by the rest of this method;
        recording the occurrences is delegated to `_extract_idea`, whose
        match count is appended to `self.ideas_counts`.

        :param idea: the proposed idea; it must support `len()` and is
            aligned against slices of each document's word ids (presumably
            a sequence of word ids itself -- TODO confirm).
        """
        idea_index = len(self.ideas)

        match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
        bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)

        for docid, wordids in self.data.docid_wordids.items():
            a, b = 0, 0  # start and end of sliding window
            doc_ideas = self.ideas_per_doc[docid]
            i = 0  # index in doc_ideas
            bag_of_words.use_doc(wordids)

            while bag_of_words.next_match() is not None:
                # The window may not run into the next already-recorded idea
                # (or past the end of the document when none is left).
                if i < len(doc_ideas):
                    max_b = doc_ideas[i][0]
                else:
                    max_b = len(wordids)

                a = bag_of_words.next_match()
                b = min(max_b, a + match_length)

                if b - a < len(idea) / 2:
                    # Window too short: resume after the blocking idea, or
                    # give up on this document when nothing is blocking.
                    if i < len(doc_ideas):
                        bag_of_words.slide_to(doc_ideas[i][1])
                        i += 1
                        continue
                    else:
                        break

                (score, start, end) = local_alignment_logprob(idea, wordids[a:b], self.align_scorer)

                # Re-anchor the window on the alignment start and realign
                # while the window can still be extended to the right.
                while start != 0 and b < max_b:
                    a += start
                    b = min(max_b, a + match_length)
                    (score, start, end) = local_alignment_logprob(idea, wordids[a:b], self.align_scorer)

                background_score = metrics.get_unigram_logprob(self.data, wordids[a+start:a+end])

                # Advance past the window just scored.  Without this step
                # next_match() keeps returning the same position and the
                # loop never terminates.
                bag_of_words.slide()

        result = _extract_idea(idea, self.N, idea_index, self.ideas_per_doc,
                               None, self.ideas_ngrams, self.data)

        self.ideas.append(idea)
        self.ideas_counts.append(result[0])
Esempio n. 2
0
def _extract_idea(idea, N, idea_index, ideas_per_doc,
                  doc_ids=None,
                  ideas_ngrams=None,
                  data=None):
    """
    Given a proposed idea, find it in the corpus, mark all positions in
    which it appears, and save its ngrams.

    Each accepted occurrence is inserted into `ideas_per_doc[docid]` as a
    `(start, end, idea_index)` tuple, in place.  The scan never extends a
    window past the start of the next entry already in that list, so the
    list is presumably kept ordered by position -- TODO confirm with caller.

    :param idea: the proposed idea; supports `len()` and is aligned against
        slices of each document's word ids.
    :param N: ngram order; widens the span handed to
        `ideas_ngrams.remove_text` by `N - 1` positions on each side.
    :param idea_index: index stored in every tuple recorded for this idea.
    :param ideas_per_doc: mapping docid -> list of `(start, end, idea_index)`
        tuples; mutated in place.
    :param doc_ids: documents to scan.  If None, all documents in
        `data.docid_wordids` are used.  An explicit empty collection scans
        nothing.
    :param ideas_ngrams: object with a `remove_text(docid, start, end)`
        method.  If None, no ngram removals are recorded.
    :param data: corpus object exposing `docid_wordids`.  If None, the
        global variable DATA is used.
    :return: tuple `(count, ideas_per_doc, ideas_ngrams)` where `count` is
        the number of occurrences recorded by this call.
    """

    # Test against None explicitly: an empty doc_ids list, or a data /
    # ideas_ngrams object that happens to be falsy, must not be mistaken
    # for "argument not supplied".
    if data is None:
        data = DATA
    if doc_ids is None:
        doc_ids = data.docid_wordids.keys()

    match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
    bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)
    count = 0

    for docid in doc_ids:
        a, b = 0, 0  # start and end of sliding window
        doc_ideas = ideas_per_doc[docid]
        i = 0  # index in doc_ideas
        wordids = data.docid_wordids[docid]
        bag_of_words.use_doc(wordids)

        while bag_of_words.next_match() is not None:
            # The window may not run into the next already-recorded idea
            # (or past the end of the document when none is left).
            if i < len(doc_ideas):
                max_b = doc_ideas[i][0]
            else:
                max_b = len(wordids)

            a = bag_of_words.next_match()
            b = min(max_b, a + match_length)

            if b - a < len(idea) / 2:
                # Window too short: resume after the blocking idea, or stop
                # scanning this document when nothing is blocking.
                if i < len(doc_ideas):
                    bag_of_words.slide_to(doc_ideas[i][1])
                    i += 1
                    continue
                else:
                    break

            (score, start, end) = local_alignment(idea, wordids[a:b])

            # Re-anchor the window on the alignment start and realign while
            # the window can still be extended to the right.
            while start != 0 and b < max_b:
                a += start
                b = min(max_b, a + match_length)
                (score, start, end) = local_alignment(idea, wordids[a:b])

            if end != 0 and score > 1:
                # start/end are offsets into the slice, hence the `a +`.
                doc_ideas.insert(i, (a + start, a + end, idea_index))
                if ideas_ngrams is not None:
                    ideas_ngrams.remove_text(
                        docid, a + start - N + 1, a + end + N - 1)
                count += 1
                i += 1
                bag_of_words.slide_to(a + end)
            else:
                bag_of_words.slide()

    return (count, ideas_per_doc, ideas_ngrams)
Esempio n. 3
0
    def extract_idea(self, idea):
        """
        Locate a proposed idea throughout the corpus and register it.

        Every accepted occurrence is inserted into `self.ideas_per_doc[docid]`
        as a `(start, end, idea_index)` tuple and the surrounding ngrams are
        removed through `self.ideas_ngrams.remove_text`.  Finally the idea
        and its occurrence count are appended to `self.ideas` and
        `self.ideas_counts`.

        :param idea: the proposed idea; supports `len()` and is aligned
            against slices of each document's word ids.
        """
        new_index = len(self.ideas)
        window = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
        min_span = len(idea) / 2
        scanner = SlidingBagOfWords(idea, window, min_span)
        found = 0

        for docid, wordids in self.data.docid_wordids.items():
            spans = self.ideas_per_doc[docid]
            pos = 0  # position of the next recorded span in `spans`
            scanner.use_doc(wordids)

            while scanner.next_match() is not None:
                # A window must stop before the next recorded span, or at
                # the end of the document when no span is left.
                blocked = pos < len(spans)
                limit = spans[pos][0] if blocked else len(wordids)

                lo = scanner.next_match()
                hi = min(limit, lo + window)

                if hi - lo < min_span:
                    # Too little room to hold a match.
                    if not blocked:
                        break
                    scanner.slide_to(spans[pos][1])
                    pos += 1
                    continue

                _score, start, end = local_alignment_logprob(
                    idea, wordids[lo:hi], self.align_scorer)

                # Shift the window onto the alignment start and realign
                # for as long as it can still grow to the right.
                while start != 0 and hi < limit:
                    lo += start
                    hi = min(limit, lo + window)
                    _score, start, end = local_alignment_logprob(
                        idea, wordids[lo:hi], self.align_scorer)

                # TODO find a better acceptance criterion than `end != 0`
                if end == 0:
                    scanner.slide()
                    continue

                spans.insert(pos, (lo + start, lo + end, new_index))
                self.ideas_ngrams.remove_text(
                    docid, lo + start - self.N + 1, lo + end + self.N - 1)
                found += 1
                pos += 1
                scanner.slide_to(lo + end)

        self.ideas.append(idea)
        self.ideas_counts.append(found)