コード例 #1
0
ファイル: seeker.py プロジェクト: christiaanw/theseeker
        def sent_heuristic(sentence, normalized=False):
            """Score a candidate sentence for ranking.

            Combines a small seed-relevance bonus (mean word_similarity of
            the sentence's words to the closed-over `seed` term, tried as
            adjective, noun, and verb senses) with the language-model score
            from score_sentence.

            FIX: the original bound the '-n' (noun) similarities to
            `verb_relevance` and the '-v' (verb) similarities to
            `noun_relevance`; the labels are corrected here. The combined
            sums below are unaffected, so behavior is unchanged.
            """
            print(seed)
            words = sentence.replace(',', '').split()
            print(words)

            # Similarity of each word to the seed, under each POS sense.
            adj_relevance = [word_similarity(w+'-a', seed) for w in words]
            noun_relevance = [word_similarity(lemma(w)+'-n', seed) for w in words]
            verb_relevance = [word_similarity(lemma(w)+'-v', seed) for w in words]

            # Keep only positive similarities.
            adj_relevance = [s for s in adj_relevance if s > 0]
            noun_relevance = [s for s in noun_relevance if s > 0]
            verb_relevance = [s for s in verb_relevance if s > 0]

            total_rels = len(adj_relevance) + len(verb_relevance) + len(noun_relevance)
            #total_rels = (len(adj_relevance) + len(noun_relevance)) if seed.endswith('-n') else (len(verb_relevance) + len(noun_relevance))
            if total_rels == 0:
                total_rels = 1  # avoid division by zero below

            relevance = sum(adj_relevance) + sum(verb_relevance) + sum(noun_relevance)
            #relevance = (sum(adj_relevance) + sum(noun_relevance)) if seed.endswith('-n') else (sum(verb_relevance) + sum(noun_relevance))

            print("relevance: {}".format(relevance))

            score = score_sentence(sentence.replace(',', ''), normalized=normalized)
            print("score: {}".format(score))

            # Mean relevance contributes only 1%; the model score dominates.
            interpolation = ((relevance / total_rels) * 0.01) + score
            print("interpolation: {}".format(interpolation))

            return interpolation
コード例 #2
0
ファイル: seeker.py プロジェクト: thricedotted/theseeker
        def sent_heuristic(sentence, normalized=False):
            """Score *sentence* for ranking: a small seed-relevance bonus
            plus the language-model score from score_sentence.

            NOTE(review): the '-n' suffix feeds `verb_relevance` and '-v'
            feeds `noun_relevance` — the labels look swapped, though the
            combined sums below are unaffected.
            """
            print(seed)
            words = sentence.replace(',', '').split()
            print(words)

            # word_similarity of each word to the seed, per POS sense.
            adj_relevance = [word_similarity(w + '-a', seed) for w in words]
            verb_relevance = [
                word_similarity(lemma(w) + '-n', seed) for w in words
            ]
            noun_relevance = [
                word_similarity(lemma(w) + '-v', seed) for w in words
            ]

            # Keep only positive similarities.
            adj_relevance = [s for s in adj_relevance if s > 0]
            verb_relevance = [s for s in verb_relevance if s > 0]
            noun_relevance = [s for s in noun_relevance if s > 0]

            total_rels = len(adj_relevance) + len(verb_relevance) + len(
                noun_relevance)
            #total_rels = (len(adj_relevance) + len(noun_relevance)) if seed.endswith('-n') else (len(verb_relevance) + len(noun_relevance))
            if total_rels == 0:
                total_rels = 1  # avoid division by zero below

            relevance = sum(adj_relevance) + sum(verb_relevance) + sum(
                noun_relevance)
            #relevance = (sum(adj_relevance) + sum(noun_relevance)) if seed.endswith('-n') else (sum(verb_relevance) + sum(noun_relevance))

            print("relevance: {}".format(relevance))

            score = score_sentence(sentence.replace(',', ''),
                                   normalized=normalized)
            print("score: {}".format(score))

            # Mean relevance contributes only 1%; the model score dominates.
            interpolation = ((relevance / total_rels) * 0.01) + score
            print("interpolation: {}".format(interpolation))

            return interpolation
コード例 #3
0
ファイル: wikihow.py プロジェクト: christiaanw/theseeker
 def convert_words(v):
     """Underscore-join the lemmas of a chunk's words.

     Determiners (DT), pronouns (PRP*), and non-alphabetic tokens are
     dropped; hyphens in the result become underscores as well.
     """
     kept = (
         lemma(w.string)
         for w in v.words
         if w.pos != "DT"
         and not w.pos.startswith('PRP')
         and w.string.isalpha()
     )
     return '_'.join(kept).replace('-', '_')
コード例 #4
0
ファイル: wikihow.py プロジェクト: christiaanw/theseeker
 def convert_words(v):
     """Space-join a verb chunk's words.

     Verbs (VB*) are lemmatized, other words kept verbatim; forms of
     "be", the tokens "to"/"wo", and non-alphabetic tokens are skipped.
     """
     def render(w):
         # Lemmatize only verb tokens.
         return lemma(w.string) if w.pos.startswith('VB') else w.string

     def wanted(w):
         return (lemma(w.string) != "be"
                 and w.string.lower() not in ('to', 'wo')
                 and w.string.isalpha())

     return ' '.join(render(w) for w in v.words if wanted(w))
コード例 #5
0
ファイル: wikihow.py プロジェクト: christiaanw/theseeker
    def exploration_log(self, type="COMPUTER"):
        """
        Return a string of the exploration log for this article.

        Each step becomes a line "NN... VERBS ( NOUNS ) => adjs", wrapped to
        76 columns, preceded by a centered, upper-cased title. Only
        type == "COMPUTER" is implemented; anything else raises
        NotImplementedError.
        """
        if type == "COMPUTER":

            def step_to_computer_verbs(string):
                # Verb chunks of the step, rendered and joined with ' -- '.
                parse = parsetree(string, relations=True)[0]

                def convert_words(v):
                    # Lemmatize verbs; drop "be", "to"/"wo", non-alpha tokens.
                    return ' '.join(lemma(w.string) if w.pos.startswith('VB') else w.string
                            for w in v.words 
                            if lemma(w.string) != "be" and w.string.lower() not in ('to', 'wo') and w.string.isalpha())

                chunks = [convert_words(v) for v in parse.verbs]

                return ' -- '.join(c for c in chunks if len(c) > 0).upper()

            def step_to_computer_nouns(string):
                # Noun phrases of the step as underscore-joined lemmas.
                parse = parsetree(string, relations=True)[0]

                def convert_words(v):
                    # Drop determiners, pronouns, non-alpha; '-' -> '_'.
                    return '_'.join(lemma(w.string)
                            for w in v.words 
                            if w.pos != "DT" and not w.pos.startswith('PRP') and w.string.isalpha()
                            ).replace('-', '_')

                chunks = [convert_words(v) for v in parse.chunk if v.pos == 'NP']

                return ' | '.join(c for c in chunks if len(c) > 0).upper()

            def step_to_computer_adjs(string):
                # Adjectives of the step; empty string if parsing fails.
                try:
                    parse = parsetree(string, relations=True)[0]
                    return ' '.join(a.string for a in parse.adjectives if a.string.isalpha())
                except IndexError:
                    return ''

            verbs = [step_to_computer_verbs(s.main) for s in self.steps]
            verbs = [v if len(v) > 0 else '???' for v in verbs]  # placeholder when empty

            nouns = [step_to_computer_nouns(s.main) for s in self.steps]
            nouns = [n if len(n) > 0 else '***' for n in nouns]  # placeholder when empty

            # Adjectives come from both the main and extra step text, deduped.
            adjs = [' '.join(set(' '.join((step_to_computer_adjs(s.main),
                     step_to_computer_adjs(s.extra))).split()))
                     for s in self.steps]
            adjs = [a.lower() if len(a) > 0 else '!null' for a in adjs]

            lines = ['{:02d}... {} ( {} ) => {}'.format(i, v, n, a)
                      for i, v, n, a in zip(range(len(verbs)), verbs, nouns, adjs)]

            # [7:] strips a fixed title prefix — presumably "How to "; confirm.
            title_words = self.title.lower().strip()[7:].split()
            title_words = [w for w in title_words if w not in ("your", "a", "an", "the")]

            # Turn the leading verb into its present participle ('part'),
            # skipping over a leading "not".
            if title_words[0].lower() != "not":
                title_words[0] = conjugate(lemma(title_words[0]), 'part').upper()
            else:
                title_words[1] = conjugate(lemma(title_words[1]), 'part').upper()

            title = "{0:^80}".format("- " + ' '.join(title_words) + " -").upper()

            # Wrap long lines, indenting continuations under the step text.
            fixed_lines = [textwrap.fill(textwrap.dedent(line).strip(),
                                         initial_indent='', 
                                         subsequent_indent='      ', 
                                         width=76)
                           for line in lines]

            return '\n\n'.join([title] + fixed_lines)

        else:
            raise NotImplementedError("Type {} of logging not implemented!".format(type))
コード例 #6
0
ファイル: seeker.py プロジェクト: thricedotted/theseeker
    def build_dream(self, wordlist, seed_concept):
        """Generate a surreal 'dream' narrative from *wordlist*, biased
        toward words similar to *seed_concept*.

        Builds POS-filtered word pools, pairs adjectives with nouns and
        adverbs with verbs (most seed-similar first), then emits random
        sentence templates until a pool runs out.

        NOTE(review): this excerpt ends at the generation loop; the
        ranking/return tail of the function is not present here.
        """
        # Seed sense string for word_similarity: noun for THING, else verb.
        seed = seed_concept.lemma + '-n' if seed_concept.type == Concept.THING else seed_concept.lemma + '-v'

        words_and_lemmas = [(w, lemma(w.string)) for w in wordlist
                            if len(w.string) > 3 and w.string.isalpha()]

        # Hand-curated drop lists: tagger errors and words that read badly.
        noun_blacklist = [
            'none', 'male', 'female', 'women', 'men', 'hear', 'thru',
            'weirder', 'guy', 'mother', 'father', 'daughter', 'brother',
            'mama', 'wife', 'thing', 'soooo', 'chan'
        ]

        adj_blacklist = [
            'such', 'much', 'however', 'about', 'most', 'least', 'more',
            'less', 'else', 'enough', 'sooo'
        ]

        verb_blacklist = ['men', 'dont', 'left',
                          'seem']  # STOP SAYING MEN IS A VERB UGH

        adv_blacklist = [
            'most', 'least', 'kinda', 'quite', 'down', 'there', 'here', 'alot',
            'much', 'such', 'more', 'back', 'else', 'very', 'about', 'sooo',
            'rather', 'however', 'thus'
        ]

        print('getting nouns...')
        # Common nouns only (NN* but not proper-noun NNP), deduped by lemma.
        nounset = set(w_lemma for w, w_lemma in words_and_lemmas
                      if w.pos.startswith('NN') and w.pos != 'NNP'
                      and w_lemma not in noun_blacklist)

        print(len(nounset))
        # Keep only words Wordnik also considers the right POS.
        nouns = [w for w in nounset if 'noun' in wordnik_pos(w)]

        print('getting adjs...')
        adjset = list(
            set(w.string.lower() for w, w_lemma in words_and_lemmas
                if w.pos.startswith('JJ')
                and w.string.lower() not in adj_blacklist))
        adjs = [w for w in adjset if 'adjective' in wordnik_pos(w)]

        print('getting verbs...')
        verbset = list(
            set(w_lemma for w, w_lemma in words_and_lemmas
                if w.pos.startswith('VB') and w_lemma not in verb_blacklist))
        verbs = [
            w for w in verbset
            if any(pos is not None and pos.startswith('verb')
                   for pos in wordnik_pos(w))
        ]

        print('getting advs...')
        advset = list(
            set(w.string.lower() for w, w_lemma in words_and_lemmas
                if w.pos.startswith('RB')
                and w.string.lower() not in adv_blacklist))
        advs = [w for w in advset if 'adverb' in wordnik_pos(w)]

        #random.shuffle(nouns)
        #random.shuffle(adjs)
        # Most seed-similar words first so they pair up below.
        nouns = sorted(nouns,
                       key=lambda x: word_similarity(x + '-n', seed),
                       reverse=True)
        adjs = sorted(adjs,
                      key=lambda x: word_similarity(x + '-a', seed),
                      reverse=True)

        # Pair adjectives with nouns; leftover nouns get an empty modifier.
        a_n_shortest = min(len(nouns), len(adjs))
        nps = [a_n for a_n in zip(adjs[:a_n_shortest], nouns[:a_n_shortest])
               ] + [('', n) for n in nouns[a_n_shortest:]]

        #random.shuffle(verbs)
        random.shuffle(advs)
        verbs = sorted(verbs,
                       key=lambda x: word_similarity(x + '-v', seed),
                       reverse=True)

        # Pair adverbs with verbs the same way.
        a_v_shortest = min(len(verbs), len(advs))
        vps = [a_v for a_v in zip(advs[:a_v_shortest], verbs[:a_v_shortest])
               ] + [('', v) for v in verbs[a_v_shortest:]]

        # sort by similarity — ascending, so .pop() yields the most
        # seed-similar phrase first
        nps = sorted(nps,
                     key=lambda x: max(word_similarity(x[0] + '-a', seed),
                                       word_similarity(x[1] + '-n', seed)))

        vps = sorted(vps, key=lambda x: word_similarity(x[1] + '-v', seed))

        #random.shuffle(nps)
        #random.shuffle(vps)

        def make_noun_string(np, plural=False):
            """Render an (adj, noun) pair as a noun phrase, randomly picking
            plural, possessive, article, or bare forms."""
            # random chance of removing modifier
            #if random.random() < 0.5:
            #    np[0] == ''

            # common mass nouns

            if np[1] in [
                    'data', 'information', 'children', 'people', 'stuff',
                    'equipment'
            ]:
                return ' '.join(np).strip()

            # Quantified/negative pronoun-like nouns stand alone.
            elif any(np[1].lower().startswith(x)
                     for x in ('every', 'any', 'some')) or np[1] in ('nothing',
                                                                     'nobody'):
                return np[1]

            quantifiers = [
                'many', 'few', 'several', 'various', 'multiple', 'fewer',
                'more'
            ]
            if np[0] in quantifiers:
                return np[0] + ' ' + pluralize(np[1])

            else:
                die_roll = random.random()
                if die_roll < 0.15 or plural:
                    return ' '.join((np[0], pluralize(np[1]))).strip()
                elif die_roll < 0.25:
                    return random.choice(
                        ('his', 'her', 'their',
                         'your')) + ' ' + ' '.join(np).strip()
                elif random.random() < 0.45:
                    # referenced() prepends the appropriate indefinite article.
                    return referenced(' '.join(np).strip())
                else:
                    return 'the ' + ' '.join(np).strip()

        def make_verb_string(vp, conj='part'):
            """Render an (adv, verb) pair, conjugating the verb ('part' =
            present participle by default)."""
            # random chance of removing modifier
            #if random.random() < 0.5:
            #    vp[0] == ''

            verb = conjugate(vp[1], conj)

            # Verbs that need a preposition to read naturally.
            if verb == 'thinking':
                verb = 'thinking of'

            if verb == 'arriving':
                verb = 'arriving at'

            if verb == 'coming':
                verb = 'coming from'

            if verb == 'going':
                verb = 'going to'

            return ' '.join((vp[0], verb)).strip()

        def get_transitive_vp():
            """Pop verb phrases until a transitive one turns up; phrases with
            known (non-None) transitivity are pushed back afterwards."""
            vp = vps.pop()
            transitivity = is_transitive(vp[1])
            checked = []

            while not transitivity:
                if transitivity is not None:
                    checked.append(vp)

                vp = vps.pop()

                transitivity = is_transitive(vp[1])

            vps.extend(checked)
            return vp

        def get_intransitive_vp():
            """Same as get_transitive_vp, but for intransitive verbs."""
            vp = vps.pop()
            intransitivity = is_intransitive(vp[1])
            checked = []

            while not intransitivity:
                if intransitivity is not None:
                    checked.append(vp)

                vp = vps.pop()

                intransitivity = is_intransitive(vp[1])

            vps.extend(checked)
            return vp

        story = []

        # Emit random sentence templates until a pool is exhausted
        # (nps.pop() / vps.pop() raises IndexError).
        while True:
            try:
                case = random.randint(0, 12)
                next_sent = ""
                if case == 0:
                    template = "{noun_string}, {verb_string}"

                    noun_string = make_noun_string(nps.pop())
                    verb_string = make_verb_string(get_intransitive_vp())

                    next_sent = template.format(noun_string=noun_string,
                                                verb_string=verb_string)

                elif case == 1:
                    template = "{noun_string} {verb_string}"

                    noun_string = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_intransitive_vp())

                    next_sent = template.format(noun_string=noun_string,
                                                verb_string=verb_string)

                elif case == 2:
                    template = "{} {} and {} {}"

                    # NOTE(review): three phrases are built but only two are
                    # used by the template.
                    two_vp = [
                        make_verb_string(get_transitive_vp()) for _ in range(3)
                    ]
                    two_np = [make_noun_string(nps.pop()) for _ in range(3)]

                    next_sent = template.format(two_vp[0], two_np[0],
                                                two_vp[1], two_np[1])

                elif case == 3:
                    template = "{verb_string} {noun_string}"

                    #np = nps.pop()
                    #noun_string = ' '.join((np[0], pluralize(np[1]))).strip()
                    noun_string = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_transitive_vp())

                    next_sent = template.format(noun_string=noun_string,
                                                verb_string=verb_string)

                elif 4 <= case <= 8:
                    # Prepositional phrase, optionally led by a verb.
                    preps = ('on', 'around', 'in', 'near', 'behind', 'over',
                             'under', 'like')
                    template = "{} " + random.choice(preps) + " {}"

                    noun_strings = [
                        make_noun_string(nps.pop()) for _ in range(2)
                    ]

                    if random.random() < 0.5:
                        next_sent = template.format(*noun_strings)
                    else:
                        verb_string = make_verb_string(get_transitive_vp())
                        next_sent = verb_string + ' ' + template.format(
                            *noun_strings)

                elif case == 9:
                    template = "{} while {}"

                    verb_strings = [
                        make_verb_string(get_intransitive_vp())
                        for _ in range(2)
                    ]

                    next_sent = template.format(*verb_strings)

                elif 10 <= case <= 12:
                    template = "{noun_string1} {verb_string} {noun_string2}"

                    noun_string1 = make_noun_string(nps.pop(), plural=True)
                    noun_string2 = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_transitive_vp())

                    next_sent = template.format(noun_string1=noun_string1,
                                                noun_string2=noun_string2,
                                                verb_string=verb_string)

                # move the adverb around
                """
                if random.random() < 0.5 and 'ly' in next_sent and ' and ' not in next_sent:
                    words = next_sent.split()
                    dont_move = ['actually', 'really', 'probably', 'nearly', 'solely']
                    ly_words = [w for w in words if w.endswith('ly') and w[0] not in 'aeiou' and w not in dont_move]
                    if len(ly_words) == 1:
                        ly_word = ly_words[0]
                        words.remove(ly_word)
                        words.append(ly_word)
                        next_sent = ' '.join(words)
                """

                story.append(next_sent)

            except IndexError:
                break  # a word pool ran dry: generation is finished
        # NOTE(review): the excerpt ends here; the ranking and return of the
        # story are not shown in this version.
コード例 #7
0
ファイル: seeker.py プロジェクト: thricedotted/theseeker
 def censored_terms(sent):
     """Return the words of *sent* that carry content POS tags (NN*/JJ*/
     RB*/VB*) but whose lemma is not a known concept of any type — i.e.
     the terms to censor.

     NOTE(review): closes over `self` and `Concept` from the enclosing
     scope; `sent` is presumably an iterable of tagged words — confirm.
     """
     return [w for w in sent
             if any(w.pos.startswith(x) for x in ('NN', 'JJ', 'RB', 'VB')) \
             and not any(self.concepts.contains(lemma(w.string), t)
                         for t in (Concept.THING, Concept.ACTION, Concept.DESCRIPTOR))
            ]
コード例 #8
0
ファイル: concept.py プロジェクト: christiaanw/theseeker
    def contains(self, string, type):
        """Return True if the lemma of *string* is a known concept of *type*.

        Raises:
            KeyError: if *type* is not a defined concept type.
        """
        try:
            return lemma(string) in self._concept_sets[type]

        except KeyError as err:
            # Chain the original lookup failure so the traceback shows
            # which key was missing.
            raise KeyError("Invalid or undefined concept type") from err
コード例 #9
0
ファイル: concept.py プロジェクト: christiaanw/theseeker
 def __init__(self, string, type):
     """Create a concept from *string*: store its lemma, its *type*, and
     an empty relation set."""
     self.lemma = lemma(string)  # canonical (lemmatized) form
     self.type = type  # concept category, e.g. Concept.THING / ACTION
     self.relations = ConceptRelationSet()
コード例 #10
0
ファイル: concept.py プロジェクト: thricedotted/theseeker
    def contains(self, string, type):
        """Return True if the lemma of *string* is a known concept of *type*.

        Raises:
            KeyError: if *type* is not a defined concept type.
        """
        try:
            return lemma(string) in self._concept_sets[type]

        except KeyError:
            # _concept_sets has no entry for this type.
            raise KeyError("Invalid or undefined concept type")
コード例 #11
0
ファイル: concept.py プロジェクト: thricedotted/theseeker
 def __init__(self, string, type):
     """Create a concept from *string*: store its lemma, its *type*, and
     an empty relation set."""
     self.lemma = lemma(string)  # canonical (lemmatized) form
     self.type = type  # concept category, e.g. Concept.THING / ACTION
     self.relations = ConceptRelationSet()
コード例 #12
0
ファイル: seeker.py プロジェクト: christiaanw/theseeker
    def build_dream(self, wordlist, seed_concept):
        """Generate a surreal 'dream' narrative from *wordlist*, biased
        toward words similar to *seed_concept*, and return it as a single
        lower-cased, period-joined string.

        Builds POS-filtered word pools, pairs adjectives with nouns and
        adverbs with verbs (most seed-similar first), emits random sentence
        templates until a pool runs out, then reranks the sentences with a
        relevance/score heuristic.
        """
        # Seed sense string for word_similarity: noun for THING, else verb.
        seed = seed_concept.lemma + '-n' if seed_concept.type == Concept.THING else seed_concept.lemma + '-v'

        words_and_lemmas = [(w, lemma(w.string)) for w in wordlist if len(w.string) > 3 and w.string.isalpha()]

        # Hand-curated drop lists: tagger errors and words that read badly.
        noun_blacklist = ['none', 'male', 'female', 'women', 'men', 
                            'hear', 'thru', 'weirder', 'guy', 'mother', 
                            'father', 'daughter', 'brother', 'mama', 
                            'wife', 'thing', 'soooo', 'chan']

        adj_blacklist = ['such', 'much', 'however', 'about', 'most', 'least',
                            'more', 'less', 'else', 'enough', 'sooo' ]

        verb_blacklist = ['men', 'dont', 'left', 'seem']    # STOP SAYING MEN IS A VERB UGH

        adv_blacklist = ['most', 'least', 'kinda', 'quite', 'down', 
                            'there', 'here', 'alot', 'much', 'such', 
                            'more', 'back', 'else', 'very', 'about', 
                            'sooo', 'rather', 'however', 'thus']

        print('getting nouns...')
        # Common nouns only (NN* but not proper-noun NNP), deduped by lemma.
        nounset = set(w_lemma for w, w_lemma in words_and_lemmas
                            if w.pos.startswith('NN') 
                            and w.pos != 'NNP' 
                            and w_lemma not in noun_blacklist
                        )

        print(len(nounset))
        # Keep only words Wordnik also considers the right POS.
        nouns = [w for w in nounset if 'noun' in wordnik_pos(w)]

        print('getting adjs...')
        adjset = list(set(w.string.lower() for w, w_lemma in words_and_lemmas
                            if w.pos.startswith('JJ') 
                            and w.string.lower() not in adj_blacklist
                        ))
        adjs = [w for w in adjset if 'adjective' in wordnik_pos(w)]


        print('getting verbs...')
        verbset = list(set(w_lemma for w, w_lemma in words_and_lemmas 
                            if w.pos.startswith('VB') 
                            and w_lemma not in verb_blacklist
                        ))
        verbs = [w for w in verbset if any(pos is not None and pos.startswith('verb') for pos in wordnik_pos(w))]

        print('getting advs...')
        advset = list(set(w.string.lower() for w, w_lemma in words_and_lemmas
                            if w.pos.startswith('RB') 
                            and w.string.lower() not in adv_blacklist
                        ))
        advs = [w for w in advset if 'adverb' in wordnik_pos(w)]

        #random.shuffle(nouns)
        #random.shuffle(adjs)
        # Most seed-similar words first so they pair up below.
        nouns = sorted(nouns, key=lambda x: word_similarity(x+'-n', seed), reverse=True)
        adjs = sorted(adjs, key=lambda x: word_similarity(x+'-a', seed), reverse=True)

        # Pair adjectives with nouns; leftover nouns get an empty modifier.
        a_n_shortest = min(len(nouns), len(adjs))
        nps = [a_n for a_n in zip(adjs[:a_n_shortest], nouns[:a_n_shortest])] + [('', n) for n in nouns[a_n_shortest:]]

        #random.shuffle(verbs)
        random.shuffle(advs)
        verbs = sorted(verbs, key=lambda x: word_similarity(x+'-v', seed), reverse=True)

        # Pair adverbs with verbs the same way.
        a_v_shortest = min(len(verbs), len(advs))
        vps = [a_v for a_v in zip(advs[:a_v_shortest], verbs[:a_v_shortest])] + [('', v) for v in verbs[a_v_shortest:]]

        # sort by similarity — ascending, so .pop() yields the most
        # seed-similar phrase first
        nps = sorted(nps, key=lambda x: max(word_similarity(x[0]+'-a', seed),
                                            word_similarity(x[1]+'-n', seed)))

        vps = sorted(vps, key=lambda x: word_similarity(x[1]+'-v', seed))

        #random.shuffle(nps)
        #random.shuffle(vps)

        def make_noun_string(np, plural=False):
            """Render an (adj, noun) pair as a noun phrase, randomly picking
            plural, possessive, article, or bare forms."""
            # random chance of removing modifier
            #if random.random() < 0.5:
            #    np[0] == ''

            # common mass nouns

            if np[1] in ['data', 'information', 'children', 'people', 'stuff', 'equipment']:
                return ' '.join(np).strip()

            # Quantified/negative pronoun-like nouns stand alone.
            elif any(np[1].lower().startswith(x) for x in ('every', 'any', 'some')) or np[1] in ('nothing', 'nobody'):
                return np[1]

            quantifiers = ['many', 'few', 'several', 'various', 'multiple', 'fewer', 'more']
            if np[0] in quantifiers:
                return np[0] + ' ' + pluralize(np[1])

            else:
                die_roll = random.random()
                if die_roll < 0.15 or plural:
                    return ' '.join((np[0], pluralize(np[1]))).strip()
                elif die_roll < 0.25:
                    return random.choice(('his', 'her', 'their', 'your')) + ' ' + ' '.join(np).strip()
                elif random.random() < 0.45:
                    # referenced() prepends the appropriate indefinite article.
                    return referenced(' '.join(np).strip())
                else:
                    return 'the ' + ' '.join(np).strip()

        def make_verb_string(vp, conj='part'):
            """Render an (adv, verb) pair, conjugating the verb ('part' =
            present participle by default)."""
            # random chance of removing modifier
            #if random.random() < 0.5:
            #    vp[0] == ''

            verb = conjugate(vp[1], conj)

            # Verbs that need a preposition to read naturally.
            if verb == 'thinking':
                verb = 'thinking of'

            if verb == 'arriving':
                verb = 'arriving at'

            if verb == 'coming':
                verb = 'coming from'

            if verb == 'going':
                verb = 'going to'

            return ' '.join((vp[0], verb)).strip()

        def get_transitive_vp():
            """Pop verb phrases until a transitive one turns up; phrases with
            known (non-None) transitivity are pushed back afterwards."""
            vp = vps.pop()
            transitivity = is_transitive(vp[1])
            checked = []

            while not transitivity:
                if transitivity is not None:
                    checked.append(vp)

                vp = vps.pop()

                transitivity = is_transitive(vp[1])

            vps.extend(checked)
            return vp

        def get_intransitive_vp():
            """Same as get_transitive_vp, but for intransitive verbs."""
            vp = vps.pop()
            intransitivity = is_intransitive(vp[1])
            checked = []

            while not intransitivity:
                if intransitivity is not None:
                    checked.append(vp)

                vp = vps.pop()

                intransitivity = is_intransitive(vp[1])

            vps.extend(checked)
            return vp

        story = []

        # Emit random sentence templates until a pool is exhausted
        # (nps.pop() / vps.pop() raises IndexError).
        while True:
            try:
                case = random.randint(0, 12)
                next_sent = ""
                if case == 0:
                    template = "{noun_string}, {verb_string}"

                    noun_string = make_noun_string(nps.pop())
                    verb_string = make_verb_string(get_intransitive_vp())

                    next_sent = template.format(noun_string=noun_string, verb_string=verb_string)

                elif case == 1:
                    template = "{noun_string} {verb_string}"

                    noun_string = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_intransitive_vp())

                    next_sent = template.format(noun_string=noun_string, verb_string=verb_string)

                elif case == 2:
                    template = "{} {} and {} {}"

                    # NOTE(review): three phrases are built but only two are
                    # used by the template.
                    two_vp = [make_verb_string(get_transitive_vp()) for _ in range(3)]
                    two_np = [make_noun_string(nps.pop()) for _ in range(3)]

                    next_sent = template.format(two_vp[0], two_np[0], two_vp[1], two_np[1])

                elif case == 3:
                    template = "{verb_string} {noun_string}"

                    #np = nps.pop()
                    #noun_string = ' '.join((np[0], pluralize(np[1]))).strip()
                    noun_string = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_transitive_vp())

                    next_sent = template.format(noun_string=noun_string, verb_string=verb_string)

                elif 4 <= case <= 8:
                    # Prepositional phrase, optionally led by a verb.
                    preps = ('on', 'around', 'in', 'near', 'behind', 'over', 'under', 'like')
                    template = "{} " + random.choice(preps) + " {}"

                    noun_strings = [make_noun_string(nps.pop()) for _ in range(2)]

                    if random.random() < 0.5:
                        next_sent = template.format(*noun_strings)
                    else:
                        verb_string = make_verb_string(get_transitive_vp())
                        next_sent = verb_string + ' ' + template.format(*noun_strings)

                elif case == 9:
                    template = "{} while {}"

                    verb_strings = [make_verb_string(get_intransitive_vp()) for _ in range(2)]

                    next_sent = template.format(*verb_strings)

                elif 10 <= case <= 12:
                    template = "{noun_string1} {verb_string} {noun_string2}"

                    noun_string1 = make_noun_string(nps.pop(), plural=True)
                    noun_string2 = make_noun_string(nps.pop(), plural=True)

                    verb_string = make_verb_string(get_transitive_vp())

                    next_sent = template.format(noun_string1=noun_string1, noun_string2=noun_string2, verb_string=verb_string)


                # move the adverb around
                """
                if random.random() < 0.5 and 'ly' in next_sent and ' and ' not in next_sent:
                    words = next_sent.split()
                    dont_move = ['actually', 'really', 'probably', 'nearly', 'solely']
                    ly_words = [w for w in words if w.endswith('ly') and w[0] not in 'aeiou' and w not in dont_move]
                    if len(ly_words) == 1:
                        ly_word = ly_words[0]
                        words.remove(ly_word)
                        words.append(ly_word)
                        next_sent = ' '.join(words)
                """

                story.append(next_sent)

            except IndexError:
                break  # a word pool ran dry: generation is finished

        # Drop sentences left dangling on a preposition.
        story = [s for s in story if not any(s.endswith(x) for x in ('to', 'from', 'at', 'of'))]

        def sent_heuristic(sentence, normalized=False):
            """Score *sentence* for ranking: a small seed-relevance bonus
            plus the language-model score from score_sentence.

            NOTE(review): the '-n' suffix feeds `verb_relevance` and '-v'
            feeds `noun_relevance` — the labels look swapped, though the
            combined sums below are unaffected.
            """
            print(seed)
            words = sentence.replace(',', '').split()
            print(words)

            # word_similarity of each word to the seed, per POS sense.
            adj_relevance = [word_similarity(w+'-a', seed) for w in words]
            verb_relevance = [word_similarity(lemma(w)+'-n', seed) for w in words]
            noun_relevance = [word_similarity(lemma(w)+'-v', seed) for w in words]

            # Keep only positive similarities.
            adj_relevance = [s for s in adj_relevance if s > 0]
            verb_relevance = [s for s in verb_relevance if s > 0]
            noun_relevance = [s for s in noun_relevance if s > 0 ]

            total_rels = len(adj_relevance) + len(verb_relevance) + len(noun_relevance)
            #total_rels = (len(adj_relevance) + len(noun_relevance)) if seed.endswith('-n') else (len(verb_relevance) + len(noun_relevance))
            if total_rels == 0:
                total_rels = 1  # avoid division by zero below

            relevance = sum(adj_relevance) + sum(verb_relevance) + sum(noun_relevance)
            #relevance = (sum(adj_relevance) + sum(noun_relevance)) if seed.endswith('-n') else (sum(verb_relevance) + sum(noun_relevance))

            print("relevance: {}".format(relevance))

            score = score_sentence(sentence.replace(',', ''), normalized=normalized)
            print("score: {}".format(score))

            # Mean relevance contributes only 1%; the model score dominates.
            interpolation = ((relevance / total_rels) * 0.01) + score
            print("interpolation: {}".format(interpolation))

            return interpolation


        # Union of the top raw-scored and top normalized-scored sentences.
        raw_rank = sorted(story, key=lambda x: sent_heuristic(x), reverse=True)[:20]
        for s in raw_rank:
            print(seed, s)
            #raw_input()
        norm_rank = sorted(story, key=lambda x: sent_heuristic(x, normalized=True), reverse=True)[:10]
        for s in norm_rank:
            print(seed, s)
            #raw_input()

        reranked_story = list(set(raw_rank + norm_rank))
        random.shuffle(reranked_story)

        # NOTE(review): `original` is computed but never used — only the
        # reranked text is returned.
        original = '. '.join(s.lower() for s in story if all(c.isalpha() or c in ' ,' for c in s)) + '.'
        reranked = '. '.join(s.lower() for s in reranked_story if all(c.isalpha() or c in ' ,' for c in s)) + '.'

        return reranked
コード例 #13
0
ファイル: seeker.py プロジェクト: christiaanw/theseeker
 def censored_terms(sent):
     """Return the words of *sent* that carry content POS tags (NN*/JJ*/
     RB*/VB*) but whose lemma is not a known concept of any type — i.e.
     the terms to censor.

     NOTE(review): closes over `self` and `Concept` from the enclosing
     scope; `sent` is presumably an iterable of tagged words — confirm.
     """
     return [w for w in sent
             if any(w.pos.startswith(x) for x in ('NN', 'JJ', 'RB', 'VB')) \
             and not any(self.concepts.contains(lemma(w.string), t) 
                         for t in (Concept.THING, Concept.ACTION, Concept.DESCRIPTOR))
            ]
コード例 #14
0
 def convert_words(v):
     """Underscore-join the lemmas of a chunk's words, dropping determiners
     (DT), pronouns (PRP*), and non-alphabetic tokens; hyphens in the
     result become underscores as well."""
     return '_'.join(
         lemma(w.string) for w in v.words
         if w.pos != "DT" and not w.pos.startswith('PRP')
         and w.string.isalpha()).replace('-', '_')
コード例 #15
0
 def convert_words(v):
     """Space-join a verb chunk's words, lemmatizing verbs (VB*) and
     skipping forms of "be", the tokens "to"/"wo", and non-alphabetic
     tokens."""
     return ' '.join(
         lemma(w.string) if w.pos.startswith('VB') else w.string
         for w in v.words
         if lemma(w.string) != "be" and w.string.lower() not in
         ('to', 'wo') and w.string.isalpha())
コード例 #16
0
    def exploration_log(self, type="COMPUTER"):
        """
        Return a string of the exploration log for this article.

        Each step becomes a line "NN... VERBS ( NOUNS ) => adjs", wrapped to
        76 columns, preceded by a centered, upper-cased title. Only
        type == "COMPUTER" is implemented; anything else raises
        NotImplementedError.
        """
        if type == "COMPUTER":

            def step_to_computer_verbs(string):
                # Verb chunks of the step, rendered and joined with ' -- '.
                parse = parsetree(string, relations=True)[0]

                def convert_words(v):
                    # Lemmatize verbs; drop "be", "to"/"wo", non-alpha tokens.
                    return ' '.join(
                        lemma(w.string) if w.pos.startswith('VB') else w.string
                        for w in v.words
                        if lemma(w.string) != "be" and w.string.lower() not in
                        ('to', 'wo') and w.string.isalpha())

                chunks = [convert_words(v) for v in parse.verbs]

                return ' -- '.join(c for c in chunks if len(c) > 0).upper()

            def step_to_computer_nouns(string):
                # Noun phrases of the step as underscore-joined lemmas.
                parse = parsetree(string, relations=True)[0]

                def convert_words(v):
                    # Drop determiners, pronouns, non-alpha; '-' -> '_'.
                    return '_'.join(
                        lemma(w.string) for w in v.words
                        if w.pos != "DT" and not w.pos.startswith('PRP')
                        and w.string.isalpha()).replace('-', '_')

                chunks = [
                    convert_words(v) for v in parse.chunk if v.pos == 'NP'
                ]

                return ' | '.join(c for c in chunks if len(c) > 0).upper()

            def step_to_computer_adjs(string):
                # Adjectives of the step; empty string if parsing fails.
                try:
                    parse = parsetree(string, relations=True)[0]
                    return ' '.join(a.string for a in parse.adjectives
                                    if a.string.isalpha())
                except IndexError:
                    return ''

            verbs = [step_to_computer_verbs(s.main) for s in self.steps]
            verbs = [v if len(v) > 0 else '???' for v in verbs]  # placeholder when empty

            nouns = [step_to_computer_nouns(s.main) for s in self.steps]
            nouns = [n if len(n) > 0 else '***' for n in nouns]  # placeholder when empty

            # Adjectives come from both the main and extra step text, deduped.
            adjs = [
                ' '.join(
                    set(' '.join((step_to_computer_adjs(s.main),
                                  step_to_computer_adjs(s.extra))).split()))
                for s in self.steps
            ]
            adjs = [a.lower() if len(a) > 0 else '!null' for a in adjs]

            lines = [
                '{:02d}... {} ( {} ) => {}'.format(i, v, n, a)
                for i, v, n, a in zip(range(len(verbs)), verbs, nouns, adjs)
            ]

            # [7:] strips a fixed title prefix — presumably "How to "; confirm.
            title_words = self.title.lower().strip()[7:].split()
            title_words = [
                w for w in title_words if w not in ("your", "a", "an", "the")
            ]

            # Turn the leading verb into its present participle ('part'),
            # skipping over a leading "not".
            if title_words[0].lower() != "not":
                title_words[0] = conjugate(lemma(title_words[0]),
                                           'part').upper()
            else:
                title_words[1] = conjugate(lemma(title_words[1]),
                                           'part').upper()

            title = "{0:^80}".format("- " + ' '.join(title_words) +
                                     " -").upper()

            # Wrap long lines, indenting continuations under the step text.
            fixed_lines = [
                textwrap.fill(textwrap.dedent(line).strip(),
                              initial_indent='',
                              subsequent_indent='      ',
                              width=76) for line in lines
            ]

            return '\n\n'.join([title] + fixed_lines)

        else:
            raise NotImplementedError(
                "Type {} of logging not implemented!".format(type))