Example #1
    def __add_basic_pos_tag(df):
        pos_path_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
        pos_path_model = "./stanford-postagger-full-2017-06-09/models/english-left3words-distsim.tagger"
        pos_tagger = StanfordPOSTagger(pos_path_model, pos_path_jar)

        pos = [pos_tagger.tag(s) for s in [df.word]]

        pos = [i[1] for i in pos[0]]

        pos = pd.DataFrame(pos)

        df['pos'] = pos

        return df
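
Example #1 tags the whole word column in a single call. A minimal hedged sketch of the same idea as a standalone helper (the function name add_basic_pos_tag is illustrative; it assumes a pandas DataFrame whose word column holds one token per row, and the same local paths to the Stanford tagger files):

import pandas as pd
from nltk.tag import StanfordPOSTagger

def add_basic_pos_tag(df):
    # assumed local paths to the Stanford POS tagger jar and English model
    jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
    model = "./stanford-postagger-full-2017-06-09/models/english-left3words-distsim.tagger"
    tagger = StanfordPOSTagger(model, jar)
    # tag() expects a list of tokens and returns (token, tag) pairs
    tagged = tagger.tag(list(df["word"]))
    df["pos"] = [tag for _, tag in tagged]
    return df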
Example #2
class POSTagger(BaseEstimator, TransformerMixin):
    def __init__(self, models_path=None):
        models_path = models_path or os.environ["MODELS_PATH"]
        jar_file = Path(models_path, "stanford-postagger.jar")
        tagger_file = Path(models_path, "spanish.tagger")

        self.tagger = StanfordPOSTagger(str(tagger_file), str(jar_file))

    def tag(self, token_list):
        tags = self.tagger.tag(token_list)
        _, tags = zip(*tags)
        return list(tags)

    def transform(self, x, y=None):
        return [self.tag(sequence) for sequence in x]
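
A hedged usage sketch for the transformer above (the models directory and the example tokens are assumptions; note that a scikit-learn Pipeline would also expect a fit method, which could be added as a no-op returning self):

tagger = POSTagger(models_path="/opt/stanford-models")  # assumed folder holding stanford-postagger.jar and spanish.tagger
print(tagger.tag(["Hola", "mundo"]))                               # one list of POS tags
print(tagger.transform([["Hola", "mundo"], ["Buenos", "días"]]))   # one tag list per token sequence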
Example #3
        guessedAnswerText = ''.join(ch for ch in guessedAnswerText if ch not in PunctuationExclude)  ######
        if guessedAnswerText != "" and guessedAnswerText[0] == ' ':
            guessedAnswerText = guessedAnswerText[1:]  # remove the first space
            # print(guessedAnswerText)


        if guessedAnswerText == question['answer']:
            correct +=1

        elif questionType == 'NUMBER':
            wrongNumber += 1
            # print(question['question'])
            # print(taggedBestAnswerSent)
            # print(questionType)
            # print(guessedAnswerText)
            # print("-----" + question['answer'])
        print(i / float(8460), ":", correct)


print("wrong in selected cat",wrongNumber)
print("total",i)
print("correct",correct)
print("correct in multi ans",possCorrect)
print("avg multi ans len", totalans/float(multiAnswer))
print(multiAnswer)




print(stanford_tagger.tag("Crazy monkey jumping on the tree".split()))  # tag() expects a list of tokens, not a raw string
Example #4
def contentToList(page_content):
    sentences = sent_tokenize(page_content)
    # sentences = page_content.split(' ')
    print(sentences)
    cleanList = []
    list_with_startelement_numbers = []  # contains the start item of every speech text
    list_with_startEnd_numbers = []  # contains the start and end items of every speech text

    for i in range(len(sentences)):
        list_element = sentences[i]
        list_element = list_element.replace("\n", "")
        list_element = list_element.replace("-", "")
        cleanList.append(list_element)  # list without "-" and "\n"
        #print("item at index", i, ":", list_element)       # all list items

        start_Element_Rede = 0
        '''analyse the structure of list_element'''
        ''' after President Lammert hands over the floor ("das Wort"), a speech begins'''
        matchers = ['Das Wort', 'das Wort']
        if any(m in list_element for m in matchers):
            print("item at index", i, ":",
                  list_element)  # list items that contain one of the matchers
            start_Element_Rede = i + 1
            list_with_startelement_numbers.append(start_Element_Rede)
            print("Start_Index_Redetext: ", start_Element_Rede)
            '''- POS -> part-of-speech: verbs, nouns, ... in the list item that matched'''
            words = word_tokenize(list_element)
            '''extracting Named Entities - Person, Organization,...'''
            jar = 'jars/stanford-postagger.jar'
            model = 'jars/german-hgc.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tagged = pos_tagger.tag(words)  # tag the tokens (was: pos_tagger.tag(pos_tagger))
            print(tagged)

            namedEnt = ne_chunk(tagged)
            print(namedEnt)

            #namedEnt.draw()

            def extract_entity_names(namedEnt):
                entityPers_names = []
                if hasattr(namedEnt, 'label') and namedEnt.label:
                    if namedEnt.label(
                    ) == 'PERSON':  #or namedEnt.label() == 'ORGANIZATION':
                        entityPers_names.append(' '.join(
                            [child[0] for child in namedEnt]))
                    else:
                        for child in namedEnt:
                            entityPers_names.extend(
                                extract_entity_names(child))
                return entityPers_names

            entityPerson_names = []
            entityPerson_names.extend(extract_entity_names(namedEnt))
            # Print all entity names
            print("Person: " + str(entityPerson_names))
            ''' Excel sheet with all politicians '''
            workbook = xlrd.open_workbook('mdb.xls')
            worksheet = workbook.sheet_by_name('Tabelle1')
            # read the first two columns (names and parties)
            value_of_first_col_Names = []
            value_of_second_col_Party = []
            first_col_Names = worksheet.col_values(0)
            second_col_Party = worksheet.col_values(1)
            print(first_col_Names)
            print(second_col_Party)

            matchers = first_col_Names
            politican_name = ""
            party_name = ""
            for i in range(len(entityPerson_names)):
                list_element = entityPerson_names[i]
                for m in range(len(matchers)):
                    matcher_element = matchers[m]
                    if matcher_element in list_element:
                        print("listen_eintrag", i, ": ", list_element)
                        print("excel_eintrag_name", m, ": ", matcher_element)
                        print("excel_eintrag_partei", m, ": ",
                              second_col_Party[m])
                        politican_name = matcher_element
                        party_name = second_col_Party[m]
                        ''' write name + party to the DB '''
            ''' connect to the Abgeordnetenwatch API - JSON data extract '''
            # import urllib.request, json
            # politican_name = politican_name.lower()
            # print(politican_name)
            # politican_name = politican_name.replace(' ','-')
            # print(politican_name)
            # with urllib.request.urlopen("https://www.abgeordnetenwatch.de/api/profile/"+politican_name+"/profile.json") as url:
            #     data = json.loads(url.read().decode())
            #     print(data)
            #     print(data['profile']['personal']['first_name']+ " " +data['profile']['personal']['last_name'])
            #     print(data['profile']['party'])
            ''' write name + party to the DB '''

    print("Liste mit Startnummern: ", list_with_startelement_numbers)
    # decrement every second start number (= an end) by 1 to mark the end of a speech
    # [start:stop:step]
    # print(list_with_startelement_numbers[1::2])
    for value in range(1, len(list_with_startelement_numbers), 2):
        list_with_startelement_numbers[
            value] = list_with_startelement_numbers[value] - 1
        #print(list_with_startelement_numbers)
    list_with_startEnd_numbers = list_with_startelement_numbers  # now contains the start and end numbers of every speech text
    print("Liste mit Start + Endnummern: ", list_with_startEnd_numbers)

    for item in range(len(cleanList)):
        element = cleanList[item]
        #print("item at index", item, ":", element)

    alle_Reden = []
    x = 0
    y = 1
    start = 1
    print(len(list_with_startEnd_numbers))
    end = len(list_with_startEnd_numbers) - 1
    active = True
    while active:
        print("x: ", x)
        print("y: ", y)
        print("start: ", start)
        if start > end:
            active = False
            print("false")
        else:
            alle_Reden.append(cleanList[
                list_with_startEnd_numbers[x]:list_with_startEnd_numbers[y]]
                              )  # [everything between start:end]
            #print("weiter")
            #print("start: ", start)
        x += 2
        y += 2
        start += 2

    # print all speeches
    for rede in alle_Reden:
        print(rede)
        print("\n")
Example #5
nltk.ne_chunk(tags_tofu)

qq_15 = qqExample.text[39579]
not_1 = sent_tokenize(qq_15)[2]
sentenceClean(qq_15)

nltk.pos_tag(
    word_tokenize(
        'it is a clean and beautiful restaurant otherwise with average service and very well kept restrooms'
    ))
nltk.pos_tag(word_tokenize('i would definitely not recommend this restaurant'))
nltk.pos_tag(word_tokenize('i won\'t like this restaurant'))
nltk.pos_tag(word_tokenize('this was really great!'))
nltk.pos_tag(word_tokenize('boring,flavorless and spicy'))
nltk.pos_tag(word_tokenize('it is boring,flavorless and spicy'))
home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)

st.tag(word_tokenize('boring,flavorless and spicy'))

nltk.pos_tag(
    word_tokenize('the tofu is pretty cold, flavorless, and a bit slimy.'))
st.tag(word_tokenize('the tofu is pretty cold, flavorless, and a bit slimy.'))

nltk.pos_tag(word_tokenize('i would definitely not recommend this restaurant'))
st.tag(word_tokenize('it\'s over-cooked!'))

nltk.pos_tag(word_tokenize(qq7))
Example #6
class ActionListGenerator:
    def __init__(self, sentence, graph):
        self.Construct_Pattern_House()
        self.sentence = sentence
        self.rdfgraph = graph
       # self.sentence = sentence
        self.st = StanfordPOSTagger('chinese-distsim.tagger')
        self.nodecount = dict()

    def Construct_Pattern_House(self):
        self.patterns = []
        self.patterns.append([u'当 (N) (V) (N) 时', 'event'])
        self.patterns.append([u'{哪} () [的]{0,1} (N) [的]{0,1} 股价 {涨幅} [会]{0,1} [最大|最多]', 'stock_increase'])
        self.patterns.append([u'{哪} (N) 股 [的|将]{0,1}  {涨} [会]{0,1} [得]{0,1}  [最大|最多]', 'specific_type_stock_increase'])

    def Generate(self):
        self.words = jieba.cut(self.sentence)
        self.sentence2 = ' '.join(list(self.words))
        self.pos = self.st.tag(self.sentence2.split())

        self.senpos = [(sp.split('#')[0], sp.split('#')[1]) for _, sp in self.pos]
        print(self.sentence2)
        print(self.pos)

        self.actions = ActionList(self.rdfgraph)

        for pat in self.patterns:
            self.match(self.senpos, pat[0], pat[1])

        print(self.actions)

    def GetCount(self, pattype):
        if pattype in self.nodecount:
            ID = self.nodecount[pattype]
            self.nodecount[pattype] += 1
            return ID
        else:
            self.nodecount[pattype] = 1
            return 0


    def match(self, senpos, pattern, pattype):
        patarr = pattern.split()
        paralist = []
        i=0
        canmatch = True
        while i < len(senpos):
            canmatch = True
            regextra = 0
            j = 0
            while j < len(patarr):
                if patarr[j][0]=='(':
                    if patarr[j][1:-1] in senpos[i+j + regextra][1]:
                        paralist.append(senpos[i+j + regextra][0])
                    else:
                        canmatch = False
                        break
                elif patarr[j][0]=='[':
                    contentstr = patarr[j].split(']')[0][1:]
                    contents = contentstr.split('|')
                    if patarr[j][-1]=='}':
                        times = patarr[j].split('{')[1][:-1].split(',')
                        minimum_allowed_occurance = int(times[0])
                        maximum_allowed_occurance = int(times[1])
                        repeat = 0
                        for repeatednum in range(minimum_allowed_occurance, maximum_allowed_occurance + 1):
                            if senpos[i + j + regextra + repeatednum][0] in contents:
                                repeat = repeatednum
                            else:
                                if repeatednum == 0:
                                    regextra -= 1
                                else:
                                    regextra += repeat
                                break
                    else:
                        if senpos[i + j + regextra][0] in contents:
                            pass
                        else:
                            canmatch = False
                            break

                elif patarr[j][0]=='{':
                    content = patarr[j][1:-1]
                    if content in senpos[i+j + regextra][0]:
                        pass
                    else:
                        canmatch = False
                        break


                elif patarr[j] == senpos[i+j + regextra][0]:
                    pass
                else:
                    canmatch = False
                    break

                j+=1

            if canmatch:
                break
            else:
                paralist = []

            i += 1




        ID = lambda x: str(self.GetCount(x))
        if pattype == 'event':
            if len(paralist) != 3 or not canmatch:
                return []

            tid =  ID('t')

            res  = ['SELECT ?t'+ tid, "  WHERE   ", "{ "]
            NodeID = ID(pattype)
            res.append('?event'+NodeID + ' <http://www.example.org/subject>  \"' + paralist[0]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/trigger> \"' + paralist[1]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/object> \"' + paralist[2]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/time>  ?t' + tid + '  .')
            res.append('}')


            command = '\n'.join(res)

            act = Action('sparql')
            act.setCommand(command)
            act.inputtype = 'None'
            act.keydict['subject'] = paralist[0]
            act.returntype = 'value'
            self.actions.add(act)


        elif pattype == 'stock_increase':
            if  not canmatch:
                return []

            if len(paralist) == 1:
                companyname = self.actions[-1].keydict['subject']
                pass
            elif len(paralist) == 2:
                companyname = paralist[0]
                pass

            res = ['SELECT ?support ?p  ', "WHERE   ", "{ "]
            NodeID = ID('company')
            res.append('?company'+NodeID + ' <http://www.example.org/support>  ?support .')
            res.append('?company'+NodeID + ' <http://www.example.org/name> \"' + companyname +'\" .')
            supportNodeID = ID('supportnode')
            stockNodeID = ID('stocknode')
            res.append('?supportnode'+supportNodeID + ' <http://www.example.org/name>  ?support .')
            res.append('?supportnode'+supportNodeID + ' <http://www.example.org/stock>  ?stock'+stockNodeID + ' .')
            res.append('?stock'+stockNodeID + ' <http://www.example.org/stocktime>  \"%s\" .')
            res.append('?stock'+stockNodeID + ' <http://www.example.org/price>  ?p .')
            res.append('}')
            command = '\n'.join(res)

            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)

            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)

            actminus = Action('minus')
            actminus.inputtype='table'
            self.actions.add(actminus)


            actmax = Action('max')
            actmax.inputtype='table'
            self.actions.add(actmax)

        elif pattype == 'specific_type_stock_increase':
            if  not canmatch:
                return []

            stocktype = paralist[0]

            res = ['SELECT ?company ?p  ', "WHERE   ", "{ "]
            companyNodeID = ID('company')
            stockNodeID = ID('stocknode')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/name>  ?company .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/stock>  ?stock' + stockNodeID + ' .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/type>  \"' + stocktype + '\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime>  \"%s\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price>  ?p .')
            res.append('}')
            command = '\n'.join(res)

            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)

            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)

            actminus = Action('minus')
            actminus.inputtype='table'
            self.actions.add(actminus)


            actmax = Action('max')
            actmax.inputtype='table'
            self.actions.add(actmax)
Example #7
 sys.stdout.write("\t")
 for tok in token2:
    sys.stdout.write("\t")
    sys.stdout.write(tok.rjust(8))
 print()
 for j in range(0,len(v.state)):
     sys.stdout.write(v.state[j])
     sys.stdout.write("\t")
     for i in range(0,len(token2)):
         sys.stdout.write("\t")
         sys.stdout.write(str(round((Viterbi_matrix2[i][j]),5)))
         sys.stdout.write("\t")
     print()
     
 print("--------------------------------------------------------------------------------")
 
 # Stanford POS Tagging
 stanford_dir = "C:/stanford-postagger/"  # change this to your own path
 model_file = stanford_dir + 'models/english-left3words-distsim.tagger'
 jarfile = stanford_dir + "stanford-postagger.jar"  # jar file
 st = StanfordPOSTagger(model_filename=model_file, path_to_jar=jarfile)
 
 print("\nSentence 1: "+seq1)
 tokens1 = word_tokenize(seq1) # tokenize into words
 print("Using Stanford POS Tagging, Sentence 1 is tagged as: ")
 print(st.tag(tokens1))  # use the tokens from word_tokenize above
 
 print("\nSentence 2: "+seq2)
 tokens2 = word_tokenize(seq2) # tokenize into words
 print("Using Stanford POS Tagging, Sentence 2 is tagged as: ")
 print(st.tag(tokens2))  # use the tokens from word_tokenize above
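
Instead of hard-coding stanford_dir, NLTK's Stanford wrappers can usually also pick up the jar and the model from environment variables; a hedged sketch (the paths below are placeholders for wherever the tagger archive was unpacked):

import os
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

os.environ["CLASSPATH"] = "C:/stanford-postagger"               # folder containing stanford-postagger.jar
os.environ["STANFORD_MODELS"] = "C:/stanford-postagger/models"  # folder containing the .tagger models

st = StanfordPOSTagger('english-left3words-distsim.tagger')     # no explicit jar/model paths needed
print(st.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))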
Example #8
import spacy

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)

qqExample = pd.read_csv('/Applications/Study/UWM/628/module2/qq.csv',
                        index_col=0)
qqExample.index = range(0, len(qqExample))

i = 3
qqExample.text[i]
nltk.pos_tag(word_tokenize(qqExample.text[i]))
st.tag(word_tokenize(qqExample.text[i]))
st.tag_sents([sent_tokenize(qqExample.text[i])])

qqAll = '. '.join(qqExample.text)
len(qqAll)
nltk.pos_tag(word_tokenize(qqAll))
st.tag(word_tokenize(qqAll))
st.tag_sents([sent_tokenize(qqAll)])

test = pd.read_csv(
    '/Applications/Study/UWM/628/module2/textUsing/chineseAllReview.csv')
test.head(5)

testAll = '. '.join(test.text)
len(testAll)
mangyiba = st.tag(word_tokenize(testAll))
Example #9
qqExample['text'] = qqExample['text'].apply(sentenceClean2)
qqExample.to_csv('/Applications/Study/UWM/628/module2/textUsing/transqq2.csv')

#nltk.pos_tag(word_tokenize('jason is a really nice guy.'))
tagList = [
    'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ'
]
nounList = ['NN', 'NNS']


def detact_noun(text):
    # keep only the nouns (NN / NNS) from the POS-tagged tokens
    testBag = nltk.pos_tag(word_tokenize(text))
    selectedBag = [word for word, tag in testBag if tag in nounList]
    return selectedBag


qqExample = pd.read_csv(
    '/Applications/Study/UWM/628/module2/textUsing/transqq2.csv', index_col=0)
qqExample.index = range(0, len(qqExample))

detact_noun(qqExample.text[41])
nltk.pos_tag(word_tokenize(qqExample.text[41]))
st.tag(word_tokenize(qqExample.text[41]))

qqExample['text'] = qqExample['text'].apply(detact_noun)
qqExample.head(5)
qqExample.to_csv('/Applications/Study/UWM/628/module2/textUsing/nounsqq.csv')
from nltk import StanfordPOSTagger

text = ''' الفيتامينات هي عناصر غذائيّة أساسية لجسم الإنسان، وهي عبارة عن مركبات عضويّة توجد طبيعيّاً في الأغذية ويحتاجها الجسم بكميّات بسيطة 
للقيام بوظائفه الطبيعية، ولا يستطيع الجسم تصنيعها أو تصنيع كميّات كافية منها لتلبي احتياجاته'''

Tagger = StanfordPOSTagger(
    './stanfor arabic modeal and tagger/arabic.tagger',
    './stanfor arabic modeal and tagger/stanford-postagger.jar')
output = Tagger.tag(text.split())
output = [tuple(filter(None, tp)) for tp in output]  # remove empty strings from the tuples

for data in output:
    print(data[0].split("/")[0] + " > " + data[0].split("/")[1] + "\n")

# References:
# 1. Stanford Arabic part-of-speech tagset
#    https://www.sketchengine.co.uk/stanford-arabic-part-of-speech-tagset/
# 2. Stanford POS tagger
#    https://nlp.stanford.edu/software/pos-tagger-faq.html#tagset