Code example #1
def main():
    with open("test.txt", 'r', encoding="utf-8") as f:
        text = f.read()

    if (False):
        sentences = array(split_into_sentences(text, True))
        if (not len(sentences)):
            print("Nothing found")
            exit(-1)

        tags = pos_tag_sents(map(word_tokenize, sentences))

        lemmatized = lemmatize_sents(
            deepcopy(tags))  # Only for aesthetic reasons

        chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                               "AN: {(<NPP>+<DT|NPP|JJ>*)+}\n "
                               "}<DT>+{\n "
                               "PH: {<[B-Z]+>+}\n "
                               "}<DT|CC|PRP|EX|WDT>+{")
        chunked = list(chunker.parse_sents(lemmatized))

        droped = setup_search_structure(chunked, tuple)

        if (True):
            num_print = input("Full data of:[None] ")
            if (num_print):
                num_print = int(num_print)
                print()

                for num_print in range(num_print, num_print + 10):
                    print(sentences[num_print])
                    print()
                    print(tags[num_print])
                    print()
                    print(lemmatized[num_print])
                    print()
                    #chunks = ne_chunk_sents(tags)
                    #iob = [tree2conlltags(chunk) for chunk in chunks]
                    #iob = tree2conlltags(chunks)
                    #print(iob[num_print])
                    #print()
                    #tree = [conlltags2tree(i) for i in iob]
                    #print(tree[num_print])
                    #print()
                    #"NP: {<IN|TO>?((<IN>?<DT>?<JJ.?>*<CD>?<NN.*>+<POS>?)+<CD>?<FW>?)+}\n "
                    #"VP: {((<WP>?<PRP>?<MD>?<VB.?>?<JJ>*<TO>?<VB.?>+<RB>*(<JJ>*<TO>?)*)+<CC>?)+}\n "

                    print(chunked[num_print])
                    print("\n###\n")

                    print(droped[0][num_print])
                    print()

                    if (input(f"({num_print}) ?> ")): break

    ### Search params
    to_search = input("Search: ") or "work"
    tag = {
        '1': 'n',
        '2': 'v',
        '3': 'a',
        '4': 'r'
    }.get(
        input(f"\nWhat '{to_search}'?\n"
              "[1]: Noun\n"
              "[2]: Verb\n"
              "[3]: Adjective\n"
              "[4]: Adverb\n\n"
              "> "), None)
    syn = 'y' in input("\nFind related words too? ").lower()
    exact = 'y' in input("\nFind exact word? ").lower()
    print()

    _, ph_num_ls, sentences = analize_text(text, exact_words=exact)
    num = 1000000
    num2 = 10

    if (to_search):
        if (syn):
            w_rel = words_related(to_search, tag)
        else:
            w_rel = to_search

        ph_nums = find(w_rel, ph_num_ls)

    print()

    if (not len(ph_nums)):
        print(f"{to_search} not in text.")
        exit(0)

    if (False):
        print(f"Looking for \"{to_search}\" {num} times...\n")

        print(timeit.timeit("find(w_rel, ph_num_ls)",
                            number=num,
                            globals={
                                **globals(),
                                **locals()
                            }),
              end=' seconds\n\n')

    if (False):
        print(f"{num2} times text setup...\n")
        print(timeit.timeit("analize_text(text)",
                            number=num2,
                            globals={
                                **globals(),
                                **locals()
                            }),
              end=' seconds \n')

    if ("y" in input("Show found instances?[No] ")):
        from colorama import init as color_init
        color_init()

        print()
        if (ph_nums is not None):  # Unnecessary, but clean
            for ph in ph_nums:
                print(_color_sent(sentences[ph], w_rel))
                print()
        else:
            print("You did not specify any search param")
Code example #2
 def __init__(self, formats: list):
     self.__text = "\n".join(formats)
     self.__parse = RegexpParser(self.__text)
Code example #3
 def setParser(self, formats):
     if type(formats) is list:
         self.__text = "\n".join(formats)
     else:
         self.__text = formats
     self.__parse = RegexpParser(self.__text)
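Taken together, Code example #2 and Code example #3 read like two methods of a small wrapper class around RegexpParser. The sketch below shows one way such a wrapper might be assembled and used; the class name ChunkerWrapper, the added parse() helper, and the demo sentence are illustrative assumptions, not part of the original snippets.

from nltk import RegexpParser, pos_tag, word_tokenize


class ChunkerWrapper:  # hypothetical name, not taken from the original code
    def __init__(self, formats: list):
        self.__text = "\n".join(formats)
        self.__parse = RegexpParser(self.__text)

    def setParser(self, formats):
        # Accept either a list of rule strings or a single grammar string.
        if type(formats) is list:
            self.__text = "\n".join(formats)
        else:
            self.__text = formats
        self.__parse = RegexpParser(self.__text)

    def parse(self, tagged_tokens):
        # Convenience accessor added for this sketch (assumption).
        return self.__parse.parse(tagged_tokens)


# Usage (requires the NLTK 'punkt' tokenizer and perceptron tagger data):
chunker = ChunkerWrapper(["NP: {<DT>?<JJ>*<NN>}"])
print(chunker.parse(pos_tag(word_tokenize("The quick brown fox jumps over the lazy dog"))))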
Code example #4
class TextParser:
    """
    Utility class for processing text content.
    """

    substitutions = [
        (r"\b(im|i'm)\b", "i am"),
        (r"\b(id|i'd)\b", "i would"),
        (r"\b(i'll)\b", "i will"),
        (r"\bbf|b/f\b", "boyfriend"),
        (r"\bgf|g/f\b", "girlfriend"),
        (r"\byoure\b", "you are"),
        (r"\b(dont|don't)\b", "do not"),
        (r"\b(didnt|didn't)\b", "did not"),
        (r"\b(wasnt|wasn't)\b", "was not"),
        (r"\b(isnt|isn't)\b", "is not"),
        (r"\b(arent|aren't)\b", "are not"),
        (r"\b(werent|weren't)\b", "were not"),
        (r"\b(havent|haven't)\b", "have not"),
        (r"\b(couldnt|couldn't)\b", "could not"),
        (r"\b(hadnt|hadn't)\b", "had not"),
        (r"\b(wouldnt|wouldn't)\b", "would not"),
        (r"\bgotta\b", "have to"),
        (r"\bgonna\b", "going to"),
        (r"\bwanna\b", "want to"),
        (r"\b(kinda|kind of)\b", ""),
        (r"\b(sorta|sort of)\b", ""),
        (r"\b(dunno|donno)\b", "do not know"),
        (r"\b(cos|coz|cus|cuz)\b", "because"),
        (r"\bfave\b", "favorite"),
        (r"\bhubby\b", "husband"),
        (r"\bheres\b", "here is"),
        (r"\btheres\b", "there is"),
        (r"\bwheres\b", "where is"),
        # Common acronyms, abbreviations and slang terms
        (r"\birl\b", "in real life"),
        (r"\biar\b", "in a relationship"),
        (r"\btotes\b", "totally"),
        (r",", " and "),
        # Remove fluff phrases
        (r"\b(btw|by the way)\b", ""),
        (r"\b(tbh|to be honest)\b", ""),
        (r"\b(imh?o|in my( humble)? opinion)\b", ""),
        # Default POS tagger seems to always tag "like"
        # (and sometimes "love") as a noun - this is a bandaid fix for now
        (r"\bprefer\b", ""),
        (r"\b(like|love)\b", "prefer"),
    ]

    # Skip if any of these is the *only* attribute - for instance,
    # "I'm a big fan of Queen" makes sense, but "I'm a fan" doesn't.
    skip_lone_attributes = [
        "fan",
        "expert",
        "person",
        "advocate",
        "customer",
    ]

    # A select set of attributes we want to exclude.
    skip_attributes = [
        "supporter", "believer", "gender", "backer", "sucker", "chapter",
        "passenger", "super", "water", "sitter", "killer", "stranger",
        "monster", "leather", "holder", "creeper", "shower", "member",
        "wonder", "hungover", "sniper", "silver", "beginner", "lurker",
        "loser", "number", "stupider", "outlier", "molester", "hitler", "beer",
        "cucumber", "earlier", "denier", "lumber", "hamster", "abuser",
        "murderer", "dealer", "consumer", "wallpaper", "paper", "madder",
        "uber", "computer", "rubber", "door", "liquor", "traitor", "favor",
        "year", "ear", "liar", "rapist", "racist", "misogynist", "apologist",
        "sexist", "satan", "batman", "veteran", "ban", "hypocrite",
        "candidate", "lot", "f****t", "teapot", "shot", "foot", "idiot",
        "bigot", "robot"
    ]

    # A select set of attributes we want to include.
    include_attributes = [
        "geek",
        "nerd",
        "nurse",
        "cook",
        "student",
        "consultant",
        "mom",
        "dad",
        "marine",
        "chef",
        "sophomore",
        "catholic",
        "mod",
        # TODO - These make sense only when accompanied by
        # at least another noun
        #"person","enthusiast","fanboy","player","advocate",
    ]

    # Super awesome logic - if noun ends in any of these, it's *probably*
    # something we want to include/exclude. TODO - This is terrible logic,
    # see if we can implement actual NLP.
    include_attribute_endings = ("er", "or", "ar", "ist", "an", "ert", "ese",
                                 "te", "ot")
    exclude_attribute_endings = ("ing", "f****r")

    # "Filler" words (in sentences such as "I think...", "I guess...", etc.)
    skip_verbs = ["were", "think", "guess", "mean"]
    skip_prepositions = ["that"]
    skip_adjectives = ["sure", "glad", "happy", "afraid", "sorry", "certain"]
    skip_nouns = [
        "right", "way", "everything", "everyone", "things", "thing", "mine",
        "stuff", "lot"
    ]

    # Should _N include conjunctions?
    grammar = r"""
        # adverb* verb adverb* 
        # - really think, strongly suggest, look intensely
        _VP:  
            {<RB.*>*<V.*>+<RB.*>*}

        # determiner adjective noun(s)
        # - a beautiful house, the strongest fighter
        _N0:
            {(<DT>*<JJ.*>*<NN.*>+(?!<POS>))+}
        _N:
            {<_N0>+}     

        # noun to/in noun 
        # - newcomer to physics, big fan of Queen, newbie in gaming
        _N_PREP_N:
            {<_N>((<TO>|<IN>)<_N>)+}

        # my adjective noun(s) 
        # - my awesome phone
        POSS:
            {<PRP\$><_N>}

        # I verb in* adjective* noun
        # - I am a great chef, I like cute animals, 
        # - I work in beautiful* New York, I live in the suburbs
        ACT1:
            {<PRP><_VP><IN>*<_N>}

        # Above + to/in noun
        # - I am a fan of Jaymay, I have trouble with flannel
        ACT2:
            {<PRP><_VP><IN>*<_N_PREP_N>}
    """

    chunker = RegexpParser(grammar)

    def clean_up(self, text):
        """
        Removes unnecessary words from text and replaces common 
        misspellings/contractions with expanded words.

        """

        for original, rep in self.substitutions:
            text = re.sub(original, rep, text, flags=re.I)
        return text

    def normalize(self, word, tag="N"):
        """
        Normalizes word using given tag. If no tag is given, NOUN is assumed.
        
        """

        kind = NOUN
        if tag.startswith("V"):
            kind = VERB
        elif tag.startswith("RB"):
            kind = ADV
        elif tag.startswith("J"):
            kind = ADJ
        return Word(word).lemmatize(kind).lower()

    def pet_animal(self, word):
        """
        Returns word if word is in a predefined list of pet animals.
        
        """

        word = word.lower()
        if re.match(r"\b(dog|cat|hamster|fish|pig|snake|rat|parrot)\b", word):
            return word
        else:
            return None

    def family_member(self, word):
        """
        Returns normalized word if word is in a predefined list 
        of family members.
        
        """

        word = word.lower()
        if re.match(r"\b(mom|mother|mum|mommy)\b", word):
            return "mother"
        elif re.match(r"\b(dad|father|pa|daddy)\b", word):
            return "father"
        elif re.match(r"\b(brother|sister|son|daughter)s?\b", word):
            return word
        else:
            return None

    def relationship_partner(self, word):
        """
        Returns word if word is in a predefined list of relationship partners.
        
        """

        word = word.lower()
        if re.match(r"\b(ex-)*(boyfriend|girlfriend|so|wife|husband)\b", word):
            return word
        else:
            return None

    def gender(self, word):
        """
        Returns normalized word if word is in a predefined list of genders.
        
        """

        word = word.lower()
        if re.match(r"\b(girl|woman|female|lady|she)\b", word):
            return "female"
        elif re.match(r"\b(guy|man|male|he|dude)\b", word):
            return "male"
        else:
            return None

    def orientation(self, word):
        """
        Returns word if word is in a predefined list of sexual orientations.
        
        """

        word = word.lower()
        if re.match(r"\b(gay|straight|bi|bisexual|homosexual)\b", word):
            return word
        else:
            return None

    def process_verb_phrase(self, verb_tree):
        """
        Returns list of (word,tag) tuples given a verb tree.
        
        """

        if verb_tree.label() != "_VP":
            return None
        verb_phrase = [(w.lower(), t) for w, t in verb_tree.leaves()]
        return verb_phrase

    def process_noun_phrase(self, noun_tree):
        """
        Returns list of (word,tag) tuples given a noun tree.
        
        """

        if noun_tree.label() != "_N":
            return []
        if any(n in self.skip_nouns + stopwords for n, t in noun_tree.leaves()
               if t.startswith("N")):
            return []

        noun_phrase = [(w.lower(), t) for w, t in noun_tree.leaves()]
        return noun_phrase

    def process_npn_phrase(self, npn_tree):
        """
        Given a phrase of the form noun-preposition-noun, returns noun 
        and preposition-noun phrases.
        
        """

        if npn_tree.label() != "_N_PREP_N":
            return None
        noun_phrase = []
        prep_noun_phrase = []
        for i in range(len(npn_tree)):
            node = npn_tree[i]
            # we have hit the prepositions in a prep noun phrase
            if type(node) is tuple:
                w, t = node
                w = w.lower()
                prep_noun_phrase.append((w, t))
            else:
                if prep_noun_phrase:
                    prep_noun_phrase += self.process_noun_phrase(node)
                else:
                    noun_phrase = self.process_noun_phrase(node)
        return (noun_phrase, prep_noun_phrase)

    def process_possession(self, phrase):
        """
        Given a phrase, checks and returns a possession/belonging
        (my <word>) if exists.
        
        """

        noun_phrase = []

        for i in range(len(phrase)):
            node = phrase[i]
            if type(node) is tuple:  # word can only be pronoun
                w, t = node
                if t == "PRP$" and w.lower() != "my":
                    return None
            else:  # type has to be nltk.tree.Tree
                if node.label() == "_N":
                    noun_phrase = self.process_noun_phrase(node)
                else:  # what could this be?
                    pass
        if noun_phrase:
            return {"kind": "possession", "noun_phrase": noun_phrase}
        else:
            return None

    def process_action(self, phrase):
        """
        Given a phrase, checks and returns an action
        (I <verb-phrase>) if exists.
        
        """

        verb_phrase = []
        prepositions = []
        noun_phrase = []
        prep_noun_phrase = []

        for i in range(len(phrase)):
            node = phrase[i]
            if type(node) is tuple:  # word is either pronoun or preposition
                w, t = node
                if t == "PRP" and w.lower() != "i":
                    return None
                elif t == "IN":
                    prepositions.append((w.lower(), t))
                else:  # what could this be?!
                    pass
            else:
                if node.label() == "_VP":
                    verb_phrase = self.process_verb_phrase(node)
                elif node.label() == "_N":
                    noun_phrase = self.process_noun_phrase(node)
                elif node.label() == "_N_PREP_N":
                    noun_phrase, prep_noun_phrase = (
                        self.process_npn_phrase(node))

        if noun_phrase:
            return {
                "kind": "action",
                "verb_phrase": verb_phrase,
                "prepositions": prepositions,
                "noun_phrase": noun_phrase,
                "prep_noun_phrase": prep_noun_phrase
            }
        else:
            return None

    def extract_chunks(self, text):
        """
        Given a block of text, extracts and returns useful chunks.

        TODO - Should sentiments be excluded here?
        
        """

        chunks = []
        sentiments = []
        text = self.clean_up(text)
        blob = TextBlob(text,
                        pos_tagger=pattern_tagger,
                        analyzer=naive_bayes_analyzer)

        for sentence in blob.sentences:

            if (not sentence.tags
                    or not re.search(r"\b(i|my)\b", str(sentence), re.I)):
                continue

            tree = self.chunker.parse(sentence.tags)

            for subtree in tree.subtrees(
                    filter=lambda t: t.label() in ['POSS', 'ACT1', 'ACT2']):
                phrase = [(w.lower(), t) for w, t in subtree.leaves()]
                phrase_type = subtree.label()

                if not any(
                        x in [("i", "PRP"), ("my", "PRP$")]
                        for x in [(w, t) for w, t in phrase]
                ) or (phrase_type in ["ACT1", "ACT2"] and
                      (any(word in self.skip_verbs for word in
                           [w for w, t in phrase if t.startswith("V")])
                       or any(word in self.skip_prepositions
                              for word in [w for w, t in phrase if t == "IN"])
                       or any(word in self.skip_adjectives
                              for word in [w
                                           for w, t in phrase if t == "JJ"]))):
                    continue

                if subtree.label() == "POSS":
                    chunk = self.process_possession(subtree)
                    if chunk:
                        chunks.append(chunk)
                elif subtree.label() in ["ACT1", "ACT2"]:
                    chunk = self.process_action(subtree)
                    if chunk:
                        chunks.append(chunk)

        return (chunks, sentiments)

    def ngrams(self, text, n=2):
        """
        Returns a list of ngrams for given text.
        
        """
        return [" ".join(w) for w in TextBlob(text).ngrams(n=n)]

    def noun_phrases(self, text):
        """
        Returns list of TextBlob-derived noun phrases.
        
        """

        return TextBlob(text).noun_phrases

    def common_words(self, text):
        """
        Given a text, splits it into words and returns as a list 
        after excluding stop words.
        
        """

        return [
            word for word in list(TextBlob(text).words)
            if (word not in stopwords and word.isalpha())
        ]

    def total_word_count(self, text):
        """
        Returns total word count of a given text.
        
        """

        return len(list(TextBlob(text).words))

    def unique_word_count(self, text):
        """
        Returns unique word count of a given text.
        
        """

        return len(set(list(TextBlob(text).words)))

    def longest_word(self, text):
        """
        Returns longest word in a given text.
        
        """

        return max((list(TextBlob(text).words)), key=len)

    @staticmethod
    def test_sentence(sentence):
        """
        Prints TextBlob-derived tags for a given sentence.

        For testing purposes only.
        
        """

        print(TextBlob(sentence).tags)
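The class above leans on module-level names that the excerpt does not show (re, stopwords, pattern_tagger, naive_bayes_analyzer, the WordNet POS constants, TextBlob, Word, RegexpParser). Assuming those are set up as in the full project, a rough driver might look like the following; the sample sentence is made up for illustration and the output naturally depends on the tagger.

# Hypothetical driver for the TextParser class above.
parser = TextParser()
sample = "I'm a huge fan of Queen and I work in beautiful New York."
chunks, sentiments = parser.extract_chunks(sample)
for chunk in chunks:
    # Each chunk dict carries at least a "kind" and a "noun_phrase".
    print(chunk["kind"], chunk["noun_phrase"])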
Code example #5
GRAMMAR = '''
CA: { <JJR><VB.*>|<RB>?<JJ> }

# Adjectives
AJ: { <CA>(<CC>?<CA>)* }

# Entities
EN: {<AJ>?<NN.*|FW>+}

# Noun-phrases
NP: {<DT>?<CC>?(<CC><CD>)*<EN>(<CC>?<EN>)*}

# Rest should be considered as a Verb-Phrase Chunk
VP: {<.*>+}
}<NP>+{
'''
PARSER = RegexpParser(GRAMMAR)
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')


class TextParser:
    @staticmethod
    def calculate_similarity(a, b) -> float:
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def generate_pos_tag_sets(input_string: str) -> next:
        """
    Break given string into sentences, and return their pos-tagged lists.\n
    **REQUIRES AN ACTIVE POS TAGGER TO BE RUNNING!!**
        :param input_string: input string. may contain one or more sentences
Code example #6
File: script.py  Project: adelrio89/codecademy
pos_tagged_text = list()

# create a for loop through each word tokenized sentence here
for word in word_tokenized_text:
    # part-of-speech tag each sentence and append to list of pos-tagged sentences here
    pos_tagged_text.append(pos_tag(word))

# store and print any part-of-speech tagged sentence here
single_pos_sentence = pos_tagged_text[100]
print(single_pos_sentence)

# define noun phrase chunk grammar here
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)

# define verb phrase chunk grammar here
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB><RB>?}"

# create verb phrase RegexpParser object here
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# create a list to hold noun phrase chunked sentences and a list to hold verb phrase chunked sentences here
np_chunked_text = list()
vp_chunked_text = list()

# create a for loop through each pos-tagged sentence here
for sentence in pos_tagged_text:
    # chunk each sentence and append to lists here
    np_chunked_text.append(np_chunk_parser.parse(sentence))
    vp_chunked_text.append(vp_chunk_parser.parse(sentence))
Code example #7
GRAMMER = 'NP: {<DT|PP\$>?<JJ.*>*<NN.*>+}\n{<JJ.*>*<NN*><CC>*<NN*>+}\n{<NNP>+}\n{<NN>+}'

# require
from nltk import RegexpParser, sent_tokenize, pos_tag, word_tokenize, FreqDist
import sys

# sanity check
if len(sys.argv) != 2:
    sys.stderr.write('Usage: ' + sys.argv[0] + " <file>\n")
    quit()

# get input
file = sys.argv[1]

# initialize
parser = RegexpParser(GRAMMER)

# open and read the input
handle = open(file, 'r')
data = handle.read()

# get all sentences and process them
sentences = sent_tokenize(data)
phrases = []
for sentence in sentences:

    # tokenize and tag the sentence
    sentence = (pos_tag(word_tokenize(sentence)))

    # parse the sentence and process each noun phrase
    tree = parser.parse(sentence)
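The example stops right after parsing, so the phrases list it initialises is never filled here. A plausible continuation, shown as an assumption rather than the original code, collects the NP subtrees inside the loop and then ranks them with the imported FreqDist:

    # Hypothetical continuation (still inside the sentence loop):
    # collect every NP chunk found in this sentence's tree.
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        phrases.append(' '.join(word for word, tag in subtree.leaves()))

# After the loop, report the most frequent noun phrases.
for phrase, count in FreqDist(phrases).most_common(10):
    print(count, phrase)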
Code example #8
def getParse(sentence) -> str:
    # Preset
    nlp = StanfordCoreNLP('stanford-corenlp-4.2.0/', memory='8g')
    cc = OpenCC('t2s')

    # sentence = 'Those two splendid old electric trains.'
    print(
        "##################################################################################"
    )
    # # POS
    print('POS:', nlp.pos_tag(sentence))
    print(
        "##################################################################################"
    )

    # # Tokenize
    print('Tokenize:', nlp.word_tokenize(sentence))
    print(
        "##################################################################################"
    )

    # # NER
    print('NER:', nlp.ner(sentence))
    print(
        "##################################################################################"
    )

    # Parser
    tree = nlp.parse(sentence)
    parse_string = ' '.join(str(tree).split())
    print(parse_string)

    # ParserTest
    print('Parser:')
    print(nlp.parse(sentence))
    print(
        "##################################################################################"
    )

    #TREE Graph
    tagged = pos_tag(word_tokenize(sentence))
    # Extract all parts of speech from any text
    chunker = RegexpParser("""
                           NP: {<DT>?<JJ>*<NN>}    # To extract Noun Phrases
                           P: {<IN>}               # To extract Prepositions
                           V: {<V.*>}              # To extract Verbs
                           PP: {<P> <NP>}          # To extract Prepositional Phrases
                           VP: {<V> <NP|PP>*}      # To extract Verb Phrases
                           """)

    # Print all parts of speech in above sentence
    output = chunker.parse(tagged)
    print("After Extracting\n", output)
    # To draw the parse tree
    output.draw()
    print(
        "##################################################################################"
    )

    # Close Stanford Parser
    nlp.close()
    return str(parse_string)
Code example #9
def chunk_it_up(tagged_text):
    chunk_pattern = "Chunk: {<DT>?<JJ>*<NN>}"
    chunk_parser = RegexpParser(chunk_pattern)
    chunked = chunk_parser.parse(tagged_text)
    chunked.draw()
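A quick way to exercise this helper, assuming pos_tag and word_tokenize are imported alongside RegexpParser (the sentence is made up, and chunked.draw() opens a Tkinter window):

from nltk import pos_tag, word_tokenize

chunk_it_up(pos_tag(word_tokenize("The little yellow dog barked at the big cat")))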
Code example #10
File: extract_server.py  Project: vc2309/NLP-Proj
from nlp_consol import stanford_tree
# def stanford_tree(line):
# output = nlp.annotate(line, properties={
# 	'annotators': 'tokenize,ssplit,pos,parse',
# 	'outputFormat': 'json'
# })
# try:
# 	return output['sentences']
# except IndexError:
# 	pass

NN_grammar = r"""
	'Noun_phrase' : {<NN.*>+}
	"""

np_parser = RegexpParser(NN_grammar)


def get_np(parse_tree):
    if isinstance(parse_tree, Tree):
        all_np = []
        get_tokens = parse_tree.pos()
        fish_np = np_parser.parse(get_tokens)
        for obj in fish_np:
            if isinstance(obj, Tree):
                np_items = [x[0] for x in obj]
                all_np.append(' '.join(np_items))
        return all_np


# read_unmatched = open('whatsapp_unmatched.txt', 'r').read().split('\n')
Code example #11
    # {<J.*>+<N.*>+}
    {<J.*>?<N.*>+}
    # {<N.*>+<OF>?<N.*>+}
    {<N.*>+<IN>?<DT>?<J.*>+<N.*>+}
    {<NNP>+<IN>?<DT>?<J.*>?<NNP>+}
    {<N.*>+<CC>?<DT>?<J.*>+<N.*>+}
    {<N.*>+<CC>?<DT>?<J.*>?<N.*>+}
    <``>{<.*>+}<''>
    <BRA>{<.*>+}<BRB>
    
    NUM:
    {<CD>+}

"""

rg_parser = RegexpParser(grammar=grammar)


def guess(qa):
    qd = qa['q']
    ast = qa['as']

    def cg(tg):
        if tg[1] == '(':
            return tg[0], 'BRA'
        if tg[1] == ')':
            return tg[0], 'BRB'
        if tg[0] == 'of':
            return 'of', 'OF'
        return tg
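The function is truncated here. Presumably cg is mapped over the POS-tagged question before chunking; a hedged sketch of that next step follows (the pos_tag/word_tokenize calls and the variable names are assumptions, not the original code):

    # Hypothetical continuation inside guess(): retag brackets and 'of',
    # then chunk the question with the parser defined above.
    tagged = [cg(t) for t in pos_tag(word_tokenize(qd))]
    tree = rg_parser.parse(tagged)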
Code example #12
def calculate_calories(descriptions):
    NP = "NP: {(<V\w+>|<NN\w?>)}"
    chunker = RegexpParser(NP)
    items = get_continuous_chunks(descriptions.lower(), chunker.parse)
    # items = ''.join(nouns)

    API_KEY = 'FEjjqylAG6cqOjq8n2sO1y3njopvccXmVPwIJYGs'
    url = 'https://api.nal.usda.gov/fdc/v1/foods/search?'

    total = 0
    for item in items:
        r = requests.get(url+'api_key={}&query={}'.format(API_KEY, item))
        res = r.json()
        nutrients = res['foods'][0]['foodNutrients']
        for nutrient in nutrients:
            if nutrient['nutrientName'] == 'Energy':
                calorie = nutrient['value']
                break
        if item == 'spicy':
            calorie = 0
        if item == 'cream':
            calorie = calorie / 5
        if item == 'sauce contains bacon':
            calorie = 0
        if item == 'note':
            calorie = 0
        if item == 'rib':
            calorie = 0
        if 'cheese' in item:
            calorie = calorie / 5
        if 'seeds' in item:
            calorie = calorie / 15
        if 'quinoa' in item:
            calorie = calorie / 5
        if 'dressing' in item:
            calorie = calorie / 2
        if 'sourdough' in item:
            calorie = calorie / 8
        if 'lemon' in item:
            calorie = calorie / 10
        if 'crushed' in item:
            calorie = calorie / 2
        if 'butter' in item:
            calorie = calorie / 100
        if 'granola' in item:
            calorie = calorie / 10
        if 'fruit' in item:
            calorie = calorie / 10
        if 'honey' in item:
            calorie = calorie / 10
        if 'compote' in item:
            calorie = calorie / 5
        if 'mayo' in item:
            calorie = calorie / 2
        if 'fried egg' in item:
            calorie = calorie / 10
        if 'potato bun' in item:
            calorie = calorie / 4
        if 'sauce' in item:
            calorie = calorie / 2
        if 'rigatoni' in item:
            calorie = calorie / 2
        if 'pesto' in item:
            calorie = calorie / 2
        if 'breadcrumbs' in item:
            calorie = calorie / 10
        if 'flakes' in item:
            calorie = calorie / 10
        if 'bacon' in item:
            calorie = calorie / 5
        if 'pappardelle' in item:
            calorie = calorie / 2
        
        total += calorie
    return jsonify({
        'calories': total
    })
Code example #13
    print("After Lemmatization :")
    print(lemmatized_tokens)

    pos_tagged_word_list = pos_tag(lemmatized_tokens)
    print("After POS Tagging :")
    print(pos_tagged_word_list)

    grammar = """ NP: {<DT>?<JJ>*<NN>}
                      {<NNP>+}
                      {<NN><NN>}
                      {<NNS><VBP>}
                      {<V.*> <TO> <V.*>}
                      {<N.*>(4,)} """

    NPChunker = RegexpParser(grammar)
    chunked_result = NPChunker.parse(pos_tagged_word_list)
    shallow_parsed_set = list()

    for sub_tree in chunked_result:
        if type(sub_tree) is nltk.tree.Tree:
            if sub_tree.label() == 'NP':
                for w, t in sub_tree.leaves():
                    if 'NN' in t:
                        shallow_parsed_set.append(w)

    print("After Chunking (Shallow Parsing) :")
    print(shallow_parsed_set)

    hypernym_parsed_set = list()
    meronym_parsed_set = list()
Code example #14
def taggerAndResultBuilder(emailInput):

    #Use a sent tokenizer (to maintain things like colons, for times, etc.)
    sentences = sent_tokenize(emailInput)
    sentencesBeforeTagging = [word_tokenize(sent) for sent in sentences]
    sentences = [pos_tag(sent) for sent in sentencesBeforeTagging]

    # This was the best that I could possibly come up with given the time I had.

    overallGrammar = """
        CLAUSE0: {<IN>?<NNP>+<CD><CD>?}
        CLAUSE1: {<DT><CD>}
        DATE: {<CLAUSE0|CLAUSE1>}
        CLAUSE2: {<VBZ>?<TO><CD><CC|NN|VBP|VBZ>?}
        CLAUSE3: {<IN|VB><RB><CD|IN><CD>?<NN|NNS>}
        CLAUSE4: {<IN><IN><CD><NN>?}
        TIME_END: {<CLAUSE2|CLAUSE3|CLAUSE4>}
        CLAUSE5: {<IN><DT>?<NN>*<NNP>+<NNPS>*<NN>?}
        CLAUSE6: {<IN><DT><NN>}
        CLAUSE7: {<TO><NNP>}
        LOCATION: {<CLAUSE5|CLAUSE6|CLAUSE7>}
        TIME_START: {<CD><NN|VBP|VBZ>?}
        """

    #	DA1: {<IN>?<NNP>+<CD><CD>?}
    #	DA2: {<DT><CD>}
    #	DATE: {<DA1|DA2>}
    #	TE3: {<IN><RB><CD><NN|NNS>}
    #	TE4: {<VB><RB><IN><CD>}
    #	TIME_END: {<TE1|TE2|TE3|TE4>}
    #	TS1: {<CD><NN|VBP|VBZ>?}
    #	TS2: {<VBZ><IN><CD>}
    #	TIME_START: {TS1|TS2}
    #   L1: {<IN><DT>?<NN>*<NNP>+<NNPS>*<NN>?}
    #    L2: {<IN><DT><NN>}
    #    L3: {<TO><NNP>}
    #    LOCATION: {<L1|L2|L3>}
    # 	"""

    # Location has an optional noun at the end in case the word "building" or "place" or something like this is included.

    # So in the off case that someone enters "am" instead of "A.M." then this
    # can actually be mistaken as a verb that's why there are the cases for
    # VBP and VBZ in TIME_START

    # Get grammar for nouns like "tonight", "tomorrow", "this afternoon", "this evening", etc.
    # And check to see if these nouns exist.  If they do then compare against the overall grammar
    # If there is no date then record these
    dateNounGrammar = """
        DATE1: {<JJ><NN>+}
        DATE2: {<DT><NN>+}
        DATE3: {<DT><NNP>+}
        DATE4: {<DT|JJ|NN><VBG>}
        DATE5: {<NN>+}
        DATE6: {<JJ><NNP>}
        """

    # DATE1 for catching things like "friday night" or "thursday night" where the day isn't capitalized and thus is JJ
    # DATE2 for "the evening time" or something like that.
    # DATE3 for "this Friday" (the tagger messes up the classification of capitalized days, etc.)
    # DATE4 for "this evening", or "(t/T)hursday evening"
    # DATE5 for "tonight", "tomorrow night", "this afternoon", "this evening", "lunch", "dinner", etc.
    # DATE6 for "this Friday", etc.

    # -----------------------------------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------------
    #
    # This is now the Grammar that will be used to extract events.
    # Keep in mind that it is often the first noun in the scheduling email that will be found
    # This is a known fact in information extraction.
    #
    # For example see:
    # http://www.iosrjournals.org/iosr-jce/papers/Conf-%20ICFTE%E2%80%9916/Volume-1/12.%2072-79.pdf?id=7557
    #
    # I was also able to come up with a grammar based on all of the random sentences I generate.
    #
    # -----------------------------------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------------

    eventGrammar = """
        EVENT1: {<DT><NN><VBG><NN>}
        EVENT2: {<DT|VBG><NN>+}
        EVENT3: {<VB|VBG><IN><NN>+}
        EVENT4: {<VBG|VBP><NNP>?<NNS>}
        EVENT5: {<NNS><VBP>}
        EVENT6: {<VB><NN|RP>}
        EVENT7: {<VB><DT><NN>}
        EVENT8: {<DT><NN><VBG><NN>}
        EVENT9: {<NN>}
        """
    # EVENT1 for "a cake eating contest"
    # EVENT2 for "having lunch" or "a meeting", or "curriculum meeting" etc.
    # EVENT3 for "wrestling in space" or "wrestle in space" or "going for ice cream" , etc.
    # EVENT4 for things like "buying Guinness beer"
    # EVENT5 for "doctor's appointment"
    # EVENT6 for "drive home" or "run away"
    # EVENT7 for "running the tap"
    # EVENT8 for "lunch" or "dinner", etc. This is last because the other POS sequences should have priority.
    # EVENT9 for pretty much everything else that could be valid.

    # Extra location grammar

    # file_object = open( homeDirectory+ "testerDataOutput.txt", "a")
    dateTimeLocationAndEventList = []
    parser1 = RegexpParser(overallGrammar)
    parser2 = RegexpParser(dateNounGrammar)
    parser3 = RegexpParser(eventGrammar)
    for sentence in sentences:
        result1 = parser1.parse(sentence)
        result2 = parser2.parse(sentence)
        result3 = parser3.parse(sentence)
        dateTimeLocationAndEventResult = cleanTaggedExpressions(
            result1, result2, result3, sentencesBeforeTagging, emailInput)
        dateTimeLocationAndEventList.append(dateTimeLocationAndEventResult)

    resultString = ""
    for result in dateTimeLocationAndEventList:
        for iter in result:
            if ("undetermined" not in iter) and (iter not in resultString):
                resultString += iter
                resultString += ", "
    for info in infoTypes:
        if info not in resultString:
            resultString += info + ": undetermined"
            resultString += ", "

    resultString = resultString.rstrip(" ")
    resultString = resultString.lstrip(" ")
    resultString = resultString.rstrip(",")
    resultString = resultString.lstrip(",")

    # So if multiple dates were found by the tagger then just offer
    # The other date as additional info, this ultimately makes the program more robust!
    # Forget about checking times, because this is already double checked by regex!
    for info in infoTypes:
        if (info == 'DATE'):
            checkerInfo = info + ":"
            count = resultString.count(info)
            if (count > 1):
                newString = resultString.rsplit(info,
                                                resultString.count(info) - 1)
                new = info + "_ADDITIONAL_INFO_FOUND"
                resultString = new.join(newString)

    return resultString
Code example #15
    lemmatizer = WordNetLemmatizer()
    for word in big_words:
        lemmatized_tokens.append(lemmatizer.lemmatize(word))
    #Add POS tagging to each lemmatized word
    pos_tagged_list = pos_tag(lemmatized_tokens)  #[('Abu','NN')]



    #------------------ Chunking (Shallow Parsing) -----------------------
    grammar = """ NP: {<DT>?<JJ>*<NN>}
                      {<NNP>+}
                      {<NN><NN>}
                      {<NNS><VBP>}
                      {<V.*> <TO> <V.*>}
                      {<N.*>(4,)} """
    NPChunker = RegexpParser(grammar) #Chunking Rule
    # Return the best chunk structure for the given tokens and return a tree.
    # http://www.bogotobogo.com/python/NLTK/chunking_NLTK.php
    chunked_result = NPChunker.parse(pos_tagged_list)
    '''
    (S
        (NP accenture/NN)
        (NP plc/NN)
        (NP global/JJ management/NN)
    '''



    #------------------ Shallow Parsed List  -----------------------
    #We only need nouns that contribute to the sentence. Ignore everything else.
    shallow_parsed_list = list()
Code example #16
content = inputFile.read()
inputFile.close()

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

contentSplit = word_tokenize(content)
print("After Split:", contentSplit)
tokens_tag = pos_tag(contentSplit)
print("After Token:", tokens_tag)
patterns = """groupeNom:{<JJ.*><NN.*>}
                        {<NN.*><NN.*>}
                        {<JJ.*><JJ.*><NN.*>}
                        {<JJ.*><NN.*><NN.*>}
          """
chunker = RegexpParser(patterns)
print("After Regex:", chunker)
output = chunker.parse(tokens_tag)
#for outputBuf in output:
#print("After Chunking",outputBuf);

#Creating output files:
outFile = open(outputPath, "w")
for outputBuffer in output:
    if (len(outputBuffer[0][0]) > 1):
        for outBufferSplit in outputBuffer:
            if (len(outBufferSplit[0]) > 2):
                outFile.write(outBufferSplit[0] + "\t" + outBufferSplit[1] +
                              "\t")
        outFile.write("\n")
outFile.close()
Code example #17
from nltk import RegexpParser
from pos_tagged_oz import pos_tagged_oz
from np_chunk_counter import np_chunk_counter

# define noun-phrase chunk grammar here
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)


# create a list to hold noun-phrase chunked sentences
np_chunked_oz = list()

# create a for loop through each pos-tagged sentence in pos_tagged_oz here
for words in pos_tagged_oz:
  np_chunked_oz.append(chunk_parser.parse(words))

# store and print the most common np-chunks here
most_common_np_chunks = np_chunk_counter(np_chunked_oz)
print(most_common_np_chunks)
Code example #18
            if first_idx is None:
                first_idx = int(token_range.split(':')[0])
            new_token_range = ':'.join(
                [str(int(x) - first_idx) for x in token_range.split(':')])
            new_line = line.strip()[:-len(token_range)] + new_token_range
            #print(token_range,new_token_range,first_idx,new_line)
            new_lines.append(new_line)
        new_texts.append('\n'.join(new_lines).strip())
    return new_texts


errors_to_correct = [
    (('Prepositions', 'Prepositional_verb', 'Prepositional_adjective',
      'Prepositional_adv', 'Prepositional_noun'), ('Spelling', ),
     PrepositionCorrector(), prepositions,
     RegexpParser(r'NP: {<IN|TO>?<DT|JJ.?|PRP\$|POS|RB.|CD|NN.*>*<NN.*|PRP>}'))
]
#(('Articles',('Spelling','Prepositions','Prepositional_verb','Prepositional_adjective','Prepositional_adv','Prepositional_noun'),
#ArticleCorrector(),['a','an','the','zero'],RegexpParser(r'NP: {<DT|JJ.?|PRP\$|POS|RB.|CD|NN.*>*<NN.*>}'))]

#regexp-based chunker

for err, preverr, corrector, options, chunker in errors_to_correct:
    predsp = None
    predst = None
    correct = []
    all_sents = []
    tagged_sents = []
    init_sents = []

    tn = 0
Code example #19
def GetVerbPhrase(sentence):
       
    #print('GetNounPhrase is called')
    output = ''
    verb_token = ''

    #Parse base-form (VB) or present-tense (VBP) verbs; RegexpParser is inaccurate at times
    grammar = 'VP: {<VB> | <VBP>}'

    #Create the Parser Object     
    cp = RegexpParser(grammar)
    
    #Tokenize the input and get part of speech  
    pos = pos_tag(word_tokenize(sentence))
    
    result = cp.parse(pos)
    
    #Debug: look at the tree formed
    #result.draw()
    #print(result)

    #Loop through the tree data structure and pull the values under the VP node
    #we created for the result
    for subtree in result.subtrees(filter=lambda t: t.label() == 'VP'):
        verb_token = ' '.join(item[0] for item in subtree.leaves()) 
    
    #print('verb found:' + verb_token)
    misclassified_verbs = ['is', 'are', 'am', 'do']
    if verb_token in misclassified_verbs:
        return ''  #if it is a verb that cannot be converted, just return blank
    
    if (len(verb_token.strip()) == 0):
        return verb_token.strip()  #if there's no verb just return blank

    #Second half of the program
    #Begin with creating a wordnet library object
    wn = wordnet        
    #debugging
    #wl = WordNetLemmatizer()
    #wn.lemma('give.v.01.give').derivationally_related_forms()

    #Use a try/except block because some verbs do not have a noun form and
    #raise an exception
    try:
        #create a lemma word of the form verb + v.01 + verb => this is what the wordnet lemma method takes
        lemma_word = verb_token + '.v.01.' + verb_token
 
        #debug to try
        # wn.lemma('perform.v.01.perform').derivationally_related_forms()
        
        #Call the lemma function and then derivationally_related_forms() to get all the applicable
        #word forms wordnet can give us
        lemma_output = wn.lemma(lemma_word).derivationally_related_forms()
        
        #debug
        #print(lemma_output)
        
        #if we find a noun form ending with ing, ial, ion we want it!
        for x in lemma_output:
            #print (x.name())
            if (re.search(r'ing$|ial$|ion$', x.name())):
                return x.name()
  
        #if its not one of the three above, return the first noun form found
        output = lemma_output[0].name()
    except:
        output = ''
        #Ideally handle the exception, in this case we return a blank
        #print("Oops!", sys.exc_info()[0], "occurred.")
    
    return output 
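A small usage example for the function above (the sentence is made up; the NLTK tokenizer, tagger and WordNet data must be available):

print(GetVerbPhrase("They perform on stage every night"))
# Prints a noun form derived from 'perform' (the exact word depends on WordNet),
# or an empty string if no usable form is found.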
Code example #20
text = '민병삼 대령의 항명행위로 초치했다'

from konlpy.tag import Okt
twitter = Okt()
words = twitter.pos(text, stem=True)
print(words)

from nltk import RegexpParser

grammar = """
NP: {<N.*>*<Suffix>?}   # define noun phrases
VP: {<V.*>*}            # define verb phrases
AP: {<A.*>*}            # define adjective phrases """
parser = RegexpParser(grammar)
print(parser)

chunks = parser.parse(words)
print(chunks)
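To pull the chunks back out of the resulting tree, one could iterate over its labelled subtrees; a minimal sketch that is not part of the original snippet:

for subtree in chunks.subtrees():
    if subtree.label() in ('NP', 'VP', 'AP'):
        print(subtree.label(), ' '.join(word for word, tag in subtree.leaves()))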