Code example #1
from nltk import RegexpParser, pos_tag_sents, word_tokenize
from numpy import array

# split_into_sentences, lemmatize_sents and setup_search_structure are
# project helpers defined elsewhere in this codebase.

def analyze_text(text: str, *, exact_words: bool = False) -> tuple:
    sentences = array(split_into_sentences(text, True))
    if not len(sentences):
        print("Nothing found")
        # Return empty placeholders so the caller's three-way unpacking
        # (search structure, phrase index, sentences) still works; the
        # original returned a bare [] despite the -> tuple annotation.
        return (), (), sentences

    # POS-tag each tokenized sentence (Penn Treebank tag set).
    tags = pos_tag_sents(map(word_tokenize, sentences))

    # With exact_words the raw tagged tokens are kept; otherwise every
    # sentence is lemmatized first.
    lemmatized = tags if exact_words else lemmatize_sents(tags)

    # AC: numeric ranges like "9 to 5"; AN: proper-noun groups; PH: any
    # remaining words with alphabetic tags, with determiners, conjunctions,
    # pronouns etc. chinked back out.  NB: Penn Treebank tags proper nouns
    # as NNP; the original <NPP> pattern could never match pos_tag output.
    chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                           "AN: {(<NNP>+<DT|NNP|JJ>*)+}\n "
                           "}<DT>+{\n "
                           "PH: {<[B-Z]+>+}\n "
                           "}<DT|CC|PRP|EX|WDT>+{")

    # Parse every tagged (or lemmatized) sentence into chunk trees.
    chunked = list(chunker.parse_sents(lemmatized))

    return (*setup_search_structure(chunked, tuple), sentences)
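To see what the AC/AN/PH grammar produces, here is a minimal, self-contained sketch that bypasses the project helpers and runs the same chunk rules over one toy sentence. The sentence and the expected chunks are illustrative assumptions, not data from the original program, and the usual NLTK data packages (punkt, averaged_perceptron_tagger) must be installed.

from nltk import RegexpParser, pos_tag, word_tokenize

chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                       "AN: {(<NNP>+<DT|NNP|JJ>*)+}\n "
                       "}<DT>+{\n "
                       "PH: {<[B-Z]+>+}\n "
                       "}<DT|CC|PRP|EX|WDT>+{")

# One toy sentence; AC should capture "9 to 5" and AN the proper nouns.
tagged = pos_tag(word_tokenize("Alice Smith worked from 9 to 5 in London."))
print(chunker.parse(tagged))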
Code example #2
import sys
import timeit
from copy import deepcopy

from nltk import RegexpParser, pos_tag_sents, word_tokenize
from numpy import array

# words_related, find and _color_sent are further project helpers defined
# elsewhere; analyze_text (and its helpers) come from code example #1.

def main():
    with open("test.txt", 'r', encoding="utf-8") as f:
        text = f.read()

    if False:  # debug block kept for reference; disabled by default
        sentences = array(split_into_sentences(text, True))
        if not len(sentences):
            print("Nothing found")
            sys.exit(-1)

        tags = pos_tag_sents(map(word_tokenize, sentences))

        # deepcopy only so the untouched tags can still be printed below.
        lemmatized = lemmatize_sents(deepcopy(tags))

        # Same chunk grammar as in analyze_text (NPP corrected to NNP).
        chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                               "AN: {(<NNP>+<DT|NNP|JJ>*)+}\n "
                               "}<DT>+{\n "
                               "PH: {<[B-Z]+>+}\n "
                               "}<DT|CC|PRP|EX|WDT>+{")
        chunked = list(chunker.parse_sents(lemmatized))

        dropped = setup_search_structure(chunked, tuple)

        if True:  # inner toggle for the detailed per-sentence dump
            num_print = input("Full data of: [None] ")
            if num_print:
                start = int(num_print)
                print()

                # Dump ten consecutive sentences from the given index.
                for num_print in range(start, start + 10):
                    print(sentences[num_print])
                    print()
                    print(tags[num_print])
                    print()
                    print(lemmatized[num_print])
                    print()
                    #chunks = ne_chunk_sents(tags)
                    #iob = [tree2conlltags(chunk) for chunk in chunks]
                    #iob = tree2conlltags(chunks)
                    #print(iob[num_print])
                    #print()
                    #tree = [conlltags2tree(i) for i in iob]
                    #print(tree[num_print])
                    #print()
                    #"NP: {<IN|TO>?((<IN>?<DT>?<JJ.?>*<CD>?<NN.*>+<POS>?)+<CD>?<FW>?)+}\n "
                    #"VP: {((<WP>?<PRP>?<MD>?<VB.?>?<JJ>*<TO>?<VB.?>+<RB>*(<JJ>*<TO>?)*)+<CC>?)+}\n "

                    print(chunked[num_print])
                    print("\n###\n")

                    print(dropped[0][num_print])
                    print()

                    if input(f"({num_print}) ?> "):
                        break

    ### Search params
    to_search = input("Search: ") or "work"
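    # Map the menu choice to a POS letter (WordNet's 'n'/'v'/'a'/'r');
    # any other input yields None, i.e. no POS restriction.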
    tag = {
        '1': 'n',
        '2': 'v',
        '3': 'a',
        '4': 'r'
    }.get(
        input(f"\nWhat '{to_search}'?\n"
              "[1]: Noun\n"
              "[2]: Verb\n"
              "[3]: Adjective\n"
              "[4]: Adverb\n\n"
              "> "), None)
    syn = 'y' in input("\nFind related words too? ").lower()
    exact = 'y' in input("\nFind exact word? ").lower()
    print()

    _, ph_num_ls, sentences = analyze_text(text, exact_words=exact)
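    # Repetition counts for the optional timeit micro-benchmarks below.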
    num = 1000000
    num2 = 10

    # Optionally expand the query with related words, then collect the
    # indices of every sentence that contains a hit.
    if to_search:
        if syn:
            w_rel = words_related(to_search, tag)
        else:
            w_rel = to_search

        ph_nums = find(w_rel, ph_num_ls)

    print()

    if not len(ph_nums):
        print(f"{to_search} not in text.")
        sys.exit(0)

    if False:  # optional micro-benchmark of find(); disabled by default
        print(f"Looking for \"{to_search}\" {num} times...\n")

        print(timeit.timeit("find(w_rel, ph_num_ls)",
                            number=num,
                            globals={
                                **globals(),
                                **locals()
                            }),
              end=' seconds\n\n')

    if False:  # optional micro-benchmark of the full setup; disabled
        print(f"Setting up the text {num2} times...\n")
        print(timeit.timeit("analyze_text(text)",
                            number=num2,
                            globals={
                                **globals(),
                                **locals()
                            }),
              end=' seconds \n')

    if ("y" in input("Show found instances?[No] ")):
        from colorama import init as color_init
        color_init()

        print()
        if ph_nums is not None:  # unnecessary, but clean
            for ph in ph_nums:
                print(_color_sent(sentences[ph], w_rel))
                print()
        else:
            print("You did not specify any search param")