def extract_idea(self, idea):
    """Given a proposed idea, find it in the corpus, add it to the list
    of discovered ideas, mark all positions in which it appears, and
    save its ngrams.

    The corpus scan itself is delegated to _extract_idea; this method
    only assigns the new idea's index and records the occurrence count.

    NOTE(review): the previous body also ran its own sliding-window
    scan whose results were never used (no insertions, count discarded,
    bag never advanced after alignment) before delegating anyway; that
    dead scan has been removed.
    """
    idea_index = len(self.ideas)
    # doc_ids=None means "scan every document" (see _extract_idea).
    result = _extract_idea(idea, self.N, idea_index, self.ideas_per_doc,
                           None, self.ideas_ngrams, self.data)
    self.ideas.append(idea)
    # result[0] is the number of occurrences recorded in the corpus.
    self.ideas_counts.append(result[0])
def _extract_idea(idea, N, idea_index, ideas_per_doc, doc_ids=None,
                  ideas_ngrams=None, data=None):
    """Given a proposed idea, find it in the corpus, mark all positions
    in which it appears, and record its ngram removals.

    For each document a bag-of-words window slides over the text looking
    for rough matches; each candidate window is refined with local
    alignment, and accepted spans are inserted as
    (start, end, idea_index) tuples into ideas_per_doc[docid], keeping
    that list sorted by position so occurrences never overlap.

    if doc_ids is None: all documents are used
    if ideas_ngrams is None: no ngram removals are recorded in an NGramCounter
    if data is None: global variable DATA is used

    Returns (count, ideas_per_doc, ideas_ngrams) where count is the
    number of occurrences recorded.
    """
    # Identity checks, not truthiness: an intentionally-empty doc_ids
    # (or a falsy data / ideas_ngrams object) must not be silently
    # replaced -- the docstring promises the fallback only for None.
    if data is None:
        data = DATA
    if doc_ids is None:
        doc_ids = data.docid_wordids.keys()
    match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
    bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)
    count = 0
    for docid in doc_ids:
        a, b = 0, 0  # start and end of sliding window
        doc_ideas = ideas_per_doc[docid]  # prior ideas, sorted by start
        i = 0  # index in doc_ideas of the next idea at/after the window
        wordids = data.docid_wordids[docid]
        bag_of_words.use_doc(wordids)
        while bag_of_words.next_match() is not None:
            # The window may not run past the start of the next
            # previously-extracted idea (occurrences must not overlap).
            if i < len(doc_ideas):
                max_b = doc_ideas[i][0]
            else:
                max_b = len(wordids)
            a = bag_of_words.next_match()
            b = min(max_b, a + match_length)
            if b - a < len(idea) / 2:
                # Window too short to hold a credible match.
                if i < len(doc_ideas):
                    # Jump past the blocking idea and keep scanning.
                    bag_of_words.slide_to(doc_ideas[i][1])
                    i += 1
                    continue
                else:
                    break  # reached the end of the document
            (score, start, end) = local_alignment(idea, wordids[a:b])
            # If the alignment begins mid-window, re-anchor the window
            # at the alignment start so the full match can fit inside.
            while start != 0 and b < max_b:
                a += start
                b = min(max_b, a + match_length)
                (score, start, end) = local_alignment(idea, wordids[a:b])
            if end != 0 and score > 1:
                # Accept: record the span, keeping doc_ideas sorted.
                doc_ideas.insert(i, (a + start, a + end, idea_index))
                if ideas_ngrams is not None:
                    # Drop every ngram overlapping the matched span.
                    ideas_ngrams.remove_text(
                        docid, a + start - N + 1, a + end + N - 1)
                count += 1
                i += 1
                bag_of_words.slide_to(a + end)
            else:
                bag_of_words.slide()
    return (count, ideas_per_doc, ideas_ngrams)
def extract_idea(self, idea):
    """Given a proposed idea, find it in the corpus, add it to the list
    of discovered ideas, mark all positions in which it appears, and
    save its ngrams.

    A bag-of-words window slides over each document looking for rough
    matches; candidates are refined with log-probability local alignment
    and accepted spans are inserted into self.ideas_per_doc[docid],
    kept sorted by position so occurrences never overlap.
    """
    idea_index = len(self.ideas)
    match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
    bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)
    count = 0
    for docid, wordids in self.data.docid_wordids.items():
        a, b = 0, 0  # start and end of sliding window
        doc_ideas = self.ideas_per_doc[docid]  # prior ideas, sorted by start
        i = 0  # index in doc_ideas of the next idea at/after the window
        bag_of_words.use_doc(wordids)
        while bag_of_words.next_match() is not None:
            # Window may not cross into the next previously-found idea.
            if i < len(doc_ideas):
                max_b = doc_ideas[i][0]
            else:
                max_b = len(wordids)
            a = bag_of_words.next_match()
            b = min(max_b, a + match_length)
            if b - a < len(idea) / 2:
                # Window too short to hold a credible match.
                if i < len(doc_ideas):
                    # Jump past the blocking idea and keep scanning.
                    bag_of_words.slide_to(doc_ideas[i][1])
                    i += 1
                    continue
                else:
                    break  # reached the end of the document
            (score, start, end) = local_alignment_logprob(idea, wordids[a:b], \
                                                          self.align_scorer)
            # If the alignment begins mid-window, re-anchor the window
            # at the alignment start so the full match can fit inside.
            while start != 0 and b < max_b:
                a += start
                b = min(max_b, a + match_length)
                (score, start, end) = local_alignment_logprob(idea, \
                                                              wordids[a:b], \
                                                              self.align_scorer)
            # Acceptance criterion differs from _extract_idea (no score
            # threshold here).
            # TODO: find a better acceptance criterion.
            if end != 0:
                # Accept: record the span, keeping doc_ideas sorted.
                doc_ideas.insert(i, (a + start, a + end, idea_index))
                # Drop every ngram overlapping the matched span.
                self.ideas_ngrams.remove_text(
                    docid, a + start - self.N + 1, a + end + self.N - 1)
                count += 1
                i += 1
                bag_of_words.slide_to(a + end)
            else:
                bag_of_words.slide()
    self.ideas.append(idea)
    self.ideas_counts.append(count)