def extract_idea(self, idea):
    """Given a proposed idea, find it in the corpus, add it to the list
    of discovered ideas, mark all positions in which it appears, and
    save its ngrams.

    The corpus scan itself is delegated to _extract_idea; this method
    only assigns the new idea's index and records the occurrence count.

    NOTE(review): the previous body also ran its own sliding-window
    scan whose results were never used (no insertions, count discarded,
    bag never advanced after alignment) before delegating anyway; that
    dead scan has been removed.
    """
    idea_index = len(self.ideas)
    # doc_ids=None means "scan every document" (see _extract_idea).
    result = _extract_idea(idea, self.N, idea_index, self.ideas_per_doc,
                           None, self.ideas_ngrams, self.data)
    self.ideas.append(idea)
    # result[0] is the number of occurrences recorded in the corpus.
    self.ideas_counts.append(result[0])
def _extract_idea(idea, N, idea_index, ideas_per_doc, doc_ids=None,
                  ideas_ngrams=None, data=None):
    """Given a proposed idea, find it in the corpus, mark all positions
    in which it appears, and record its ngram removals.

    For each document a bag-of-words window slides over the text looking
    for rough matches; each candidate window is refined with local
    alignment, and accepted spans are inserted as
    (start, end, idea_index) tuples into ideas_per_doc[docid], keeping
    that list sorted by position so occurrences never overlap.

    if doc_ids is None: all documents are used
    if ideas_ngrams is None: no ngram removals are recorded in an NGramCounter
    if data is None: global variable DATA is used

    Returns (count, ideas_per_doc, ideas_ngrams) where count is the
    number of occurrences recorded.
    """
    # Identity checks, not truthiness: an intentionally-empty doc_ids
    # (or a falsy data / ideas_ngrams object) must not be silently
    # replaced -- the docstring promises the fallback only for None.
    if data is None:
        data = DATA
    if doc_ids is None:
        doc_ids = data.docid_wordids.keys()
    match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
    bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)
    count = 0
    for docid in doc_ids:
        a, b = 0, 0  # start and end of sliding window
        doc_ideas = ideas_per_doc[docid]  # prior ideas, sorted by start
        i = 0  # index in doc_ideas of the next idea at/after the window
        wordids = data.docid_wordids[docid]
        bag_of_words.use_doc(wordids)
        while bag_of_words.next_match() is not None:
            # The window may not run past the start of the next
            # previously-extracted idea (occurrences must not overlap).
            if i < len(doc_ideas):
                max_b = doc_ideas[i][0]
            else:
                max_b = len(wordids)
            a = bag_of_words.next_match()
            b = min(max_b, a + match_length)
            if b - a < len(idea) / 2:
                # Window too short to hold a credible match.
                if i < len(doc_ideas):
                    # Jump past the blocking idea and keep scanning.
                    bag_of_words.slide_to(doc_ideas[i][1])
                    i += 1
                    continue
                else:
                    break  # reached the end of the document
            (score, start, end) = local_alignment(idea, wordids[a:b])
            # If the alignment begins mid-window, re-anchor the window
            # at the alignment start so the full match can fit inside.
            while start != 0 and b < max_b:
                a += start
                b = min(max_b, a + match_length)
                (score, start, end) = local_alignment(idea, wordids[a:b])
            if end != 0 and score > 1:
                # Accept: record the span, keeping doc_ideas sorted.
                doc_ideas.insert(i, (a + start, a + end, idea_index))
                if ideas_ngrams is not None:
                    # Drop every ngram overlapping the matched span.
                    ideas_ngrams.remove_text(
                        docid, a + start - N + 1, a + end + N - 1)
                count += 1
                i += 1
                bag_of_words.slide_to(a + end)
            else:
                bag_of_words.slide()
    return (count, ideas_per_doc, ideas_ngrams)
def extract_idea(self, idea):
    """Given a proposed idea, find it in the corpus, add it to the list
    of discovered ideas, mark all positions in which it appears, and
    save its ngrams.

    A bag-of-words window slides over each document looking for rough
    matches; candidates are refined with log-probability local alignment
    and accepted spans are inserted into self.ideas_per_doc[docid],
    kept sorted by position so occurrences never overlap.
    """
    idea_index = len(self.ideas)
    match_length = int(math.ceil(len(idea) * LENGTH_MULTIPLIER))
    bag_of_words = SlidingBagOfWords(idea, match_length, len(idea) / 2)
    count = 0
    for docid, wordids in self.data.docid_wordids.items():
        a, b = 0, 0  # start and end of sliding window
        doc_ideas = self.ideas_per_doc[docid]  # prior ideas, sorted by start
        i = 0  # index in doc_ideas of the next idea at/after the window
        bag_of_words.use_doc(wordids)
        while bag_of_words.next_match() is not None:
            # Window may not cross into the next previously-found idea.
            if i < len(doc_ideas):
                max_b = doc_ideas[i][0]
            else:
                max_b = len(wordids)
            a = bag_of_words.next_match()
            b = min(max_b, a + match_length)
            if b - a < len(idea) / 2:
                # Window too short to hold a credible match.
                if i < len(doc_ideas):
                    # Jump past the blocking idea and keep scanning.
                    bag_of_words.slide_to(doc_ideas[i][1])
                    i += 1
                    continue
                else:
                    break  # reached the end of the document
            (score, start, end) = local_alignment_logprob(idea, wordids[a:b], \
                                                          self.align_scorer)
            # If the alignment begins mid-window, re-anchor the window
            # at the alignment start so the full match can fit inside.
            while start != 0 and b < max_b:
                a += start
                b = min(max_b, a + match_length)
                (score, start, end) = local_alignment_logprob(idea, \
                                                              wordids[a:b], \
                                                              self.align_scorer)
            # Acceptance criterion differs from _extract_idea (no score
            # threshold here).
            # TODO: find a better acceptance criterion.
            if end != 0:
                # Accept: record the span, keeping doc_ideas sorted.
                doc_ideas.insert(i, (a + start, a + end, idea_index))
                # Drop every ngram overlapping the matched span.
                self.ideas_ngrams.remove_text(
                    docid, a + start - self.N + 1, a + end + self.N - 1)
                count += 1
                i += 1
                bag_of_words.slide_to(a + end)
            else:
                bag_of_words.slide()
    self.ideas.append(idea)
    self.ideas_counts.append(count)