from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize


def named_entities(sentence):
    st = StanfordNERTagger(
        model_filename=
        '../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    tags = st.tag(word_tokenize(sentence))

    # clean up the result from the tagger
    prev_tag_name = str(tags[0][1])
    cur_entity = str(tags[0][0])
    entities = {}
    for i in range(1, len(tags)):
        cur_token = str(tags[i][0])
        cur_tag_name = str(tags[i][1])
        if cur_tag_name == prev_tag_name:
            cur_entity = cur_entity + " " + cur_token
        else:
            if prev_tag_name not in entities:
                entities[prev_tag_name] = []
            # change encoding; another way is to .encode('ascii', 'ignore')
            entities[prev_tag_name].append(str(cur_entity))
            cur_entity = cur_token
        prev_tag_name = cur_tag_name
    # flush the final entity, which the loop above never appends
    if prev_tag_name not in entities:
        entities[prev_tag_name] = []
    entities[prev_tag_name].append(cur_entity)
    if 'O' in entities:
        del entities['O']  # not needed, 'O' means not a named entity
    return entities
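The bookkeeping above can be written more compactly with itertools.groupby, which groups consecutive (token, tag) pairs by tag; a minimal sketch, assuming the pairs returned by st.tag:

from itertools import groupby

def group_entities(tags):
    """Group consecutive tokens that share a tag into multi-word entities."""
    entities = {}
    for tag_name, group in groupby(tags, key=lambda pair: pair[1]):
        if tag_name == 'O':  # 'O' marks tokens outside any named entity
            continue
        entity = " ".join(token for token, _ in group)
        entities.setdefault(tag_name, []).append(entity)
    return entities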
Example No. 2
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        tags = st.tag_sents(tokenized_list)

        # indexes of all words in each sentence that carry an NER tag
        ids = [[i for i, a in enumerate(t) if a[1] != "O"]
               for t in tags]

        phrases = []
        for i, t in zip(ids, tags):
            phrase = ""
            tt = "N/A"
            for index, p in enumerate(i):
                if index == len(i) - 1:
                    phrase += "{}".format(t[p][0])
                    tt = phrase, t[p][1]  # (joined phrase, tag of its last word)
                else:
                    phrase += "{} ".format(t[p][0])

            phrases.append(tt)
        return phrases
Example No. 3
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file,
                        output_file):
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile

    st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    i = 0
    tagged_ne = []
    with open(tag_this_file, "r") as f:
        for line in f:
            line = line.split()
            i += 1
            if len(line) > 0:
                tagged_ne.append(line[0])
            else:
                # Remove the SENENDs from the output file afterwards; needed to keep the format consistent.
                # Keep in mind that some "/" are still removed; they are replaced in a postprocessing step.
                tagged_ne.append("SENEND")
    print(tagged_ne)

    # Tag the file using Stanford NER
    out = st.tag(tagged_ne)

    # Write the results to a tsv file
    with open(output_file, "w") as f:
        for i in out:
            f.write(str(i[0]) + "\t" + i[1] + "\n")
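The comments above defer removal of the SENEND sentinels to a later pass; a minimal sketch of such a postprocessing step, assuming the tab-separated layout written by this function (the helper name is illustrative, not from the original project):

def strip_senend(tsv_in, tsv_out):
    """Drop the SENEND sentinel rows, writing a blank line in their place
    so that sentence boundaries survive in the cleaned file."""
    with open(tsv_in, "r") as fin, open(tsv_out, "w") as fout:
        for row in fin:
            if row.split("\t")[0] == "SENEND":
                fout.write("\n")  # keep the sentence boundary
            else:
                fout.write(row)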
Example No. 4
def getNER(sent):
    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    Entities = {}
    EntityName = ""
    length = len(NerSen)
    for i in range(length):
        if NerSen[i][1] != 'O':
            # join consecutive tagged tokens into one entity name
            if i > 0 and NerSen[i - 1][1] != 'O':
                EntityName += '_'
            EntityName += NerSen[i][0]
            if i == length - 1 or NerSen[i + 1][1] == 'O':
                Entities[EntityName] = NerSen[i][1]
                EntityName = ""
    words = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(words)
    EntityKey = list(Entities.keys())
    # fall back to POS tagging: keep proper nouns the NER tagger missed
    for token, tag in pos_tags:
        if tag == "NNP":
            isContain = False
            for key in EntityKey:
                if token in key:
                    isContain = True
            if not isContain:
                Entities[token] = "NNP"
    return Entities
Example No. 5
    def __init__(self, articleText):
        self.blob = textblob.TextBlob(articleText)
        keywords = [
            x[0] for x in self.blob.tags
            if "NNP" in x[1] or "NN" in x[1] or "CD" in x[1]
        ]
        self.keywords = set(keywords)
        self.nounPhrases = Counter(self.blob.noun_phrases).most_common()
        st = StanfordNERTagger(
            'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz',
            'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/stanford-corenlp-caseless-2015-04-20-models.jar'
        )
        self.namedEntities = dict((a.lower(), b) for a, b in set(
            [x for x in st.tag(articleText.split()) if x[1] != 'O']))
        self.namedPhrases = {}

        for np in self.nounPhrases:
            tags = []
            for word in np[0].split():
                tag = 'O'
                if word.lower() in self.namedEntities.keys():
                    tag = self.namedEntities[word.lower()]
                tags.append(tag)

            np_tag = Counter(tags).most_common(1)[0][0]
            if np_tag != 'O':
                self.namedPhrases[np[0].lower()] = np_tag

Example No. 6
def stanfordNer():
    with open("federer.txt", "r") as f:
        text = f.read()

    jar = './stanford-ner-tagger/stanford-ner.jar'
    model = './stanford-ner-tagger/english.muc.7class.distsim.crf.ser.gz'

    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
    words = nltk.word_tokenize(text)
    color_print('\nStanfordNER', color='blue', bold=True, underline=True)
    # "Prepoznati entiteti" = "Recognized entities" (Croatian)
    color_print('\nPrepoznati entiteti:\n',
                color='yellow',
                bold=True,
                underline=True)

    # table columns: "Recognized entity", "Entity type"
    table = PrettyTable(["Prepoznati entitet", "Tip entiteta"])

    for token, tag in ner_tagger.tag(words):
        if tag != "O":
            table.add_row([token, tag])
    print(table)

    with open("Rezultati_StanfordNER.txt", "w", encoding="utf8") as text_file:
        text_file.write("Prepoznati entiteti: \n\n%s" % table)

    # "Results saved to file Rezultati_StanfordNER.txt"
    print("Rezultati sačuvani u fajl Rezultati_StanfordNER.txt")
Example No. 7
    def out(self):
        stanford_dir = '/home/gary/stanford-ner-2015-04-20/'
        jarfile = stanford_dir + 'stanford-ner.jar'
        modelfile = stanford_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
        st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
        tokenized_text = word_tokenize(self.text)
        classified_text = st.tag(tokenized_text)

        #return organization names
        regexNN = re.compile('ORGANIZATION')
        org_list = list()
        i = 0
        while i < len(classified_text):
            item = classified_text[i]
            if regexNN.search(item[1]):
                n = i + 1
                company = item[0]

                while n < len(classified_text):
                    item = classified_text[n]
                    if regexNN.search(item[1]):
                        company = company + " " + item[0]
                        n = n + 1
                    else:
                        break
                i = n
                org_list.append(company)
            i = i + 1

        return org_list
Example No. 8
def main():
    try:
        filename = sys.argv[1]
        f = open(filename, "r")
    except IndexError:
        print(
            "You probably didn't specify an input file. Correct format: python3 ass5.py <InputFileName>"
        )
        exit()
    except FileNotFoundError:
        print(
            "The file you specified does not exist. Please check and try again."
        )
        exit()
    inputs = f.readlines()
    f.close()

    jar = './stanford-ner.jar'
    model = './ner-model-english.ser.gz'
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

    with open("output.txt", "w") as file2:
        for sentence in inputs:
            words = nltk.word_tokenize(sentence)

            for x in ner_tagger.tag(words):
                file2.write("[" + x[0] + ", " + x[1] + "], ")
            file2.write('\n')
Example No. 9
def tagSent(sent):
    words = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(words)
    Entities = {}
    EntityName = ""
    length = len(pos_tags)
    NounTag = ["NN", "NNS", "CD"]
    for i in range(length):
        if pos_tags[i][1] == 'NNP':
            # join consecutive proper nouns into one entity name
            if i > 0 and pos_tags[i - 1][1] == 'NNP':
                EntityName += '_'
            EntityName += pos_tags[i][0]
            if i == length - 1 or pos_tags[i + 1][1] != 'NNP':
                Entities[EntityName] = pos_tags[i][1]
                EntityName = ""
        if pos_tags[i][1] in NounTag:
            # prepend any preceding adjectives to the noun
            EntityName += pos_tags[i][0]
            j = i - 1
            while j >= 0 and pos_tags[j][1] == "JJ":
                EntityName = pos_tags[j][0] + " " + EntityName
                j -= 1
            Entities[EntityName] = "OTHER"
            EntityName = ""

    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    # promote any entity containing a PERSON-tagged word to PERSON
    for token, etype in Entities.items():
        for word, tag in NerSen:
            if (word in token) and ("PERSON" == tag):
                Entities[token] = "PERSON"
    return Entities
Example No. 10
def extract_character_names(book_contents):
    data = CharacterData()

    lines = sent_tokenize(book_contents)
    tagger = StanfordNERTagger(
        "./stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz",
        "./stanford-ner/stanford-ner.jar")

    discovered_names_in_line = set()
    for line in lines:
        words = word_tokenize(line)
        tagged_tokens = tagger.tag(words)

        name = ""
        first_name = ""
        discovered_names_in_line.clear()
        for word, tag in tagged_tokens:
            if tag == "PERSON":
                if name == "":
                    first_name = word
                name += word + " "
            else:
                if name != "":
                    name = name.strip()
                    if first_name not in discovered_names_in_line:
                        data.add_line_for_name(first_name, name, line)
                        discovered_names_in_line.add(first_name)
                    name = ""
                    first_name = ""
        # flush a name that runs to the end of the line
        if name != "":
            name = name.strip()
            if first_name not in discovered_names_in_line:
                data.add_line_for_name(first_name, name, line)
                discovered_names_in_line.add(first_name)

    return data
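CharacterData is not shown in this example; a minimal stand-in consistent with the single add_line_for_name call above (the field layout is an assumption, not the original class):

from collections import defaultdict

class CharacterData:
    """Hypothetical stand-in: tracks the full names and lines seen for each
    character, keyed by first name."""

    def __init__(self):
        self.full_names = defaultdict(set)
        self.lines_by_name = defaultdict(list)

    def add_line_for_name(self, first_name, full_name, line):
        self.full_names[first_name].add(full_name)
        self.lines_by_name[first_name].append(line)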
Example No. 11
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        # flatten the per-sentence results into one (token, tag) list
        n = []
        for nt in NERTags:
            n.extend(nt)

        # get the indexes of all words that have NER tags
        ids = [i for i, a in enumerate(n) if a[1] != "O"]
        a = np.array(ids)

        # split the index list into runs of consecutive values
        consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)

        phrases = []
        for ci in consecutive_ids:
            phrase = ""
            for id_ in ci:
                phrase += "{} ".format(n[id_][0])
            phrases.append(phrase.strip())

        cleaned_phrases = self.del_repeat(phrases)
        return cleaned_phrases
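The np.diff/np.split combination above is what turns the flat index list into runs of consecutive indexes; a quick illustration of the idiom on made-up data:

import numpy as np

a = np.array([2, 3, 4, 9, 10, 15])
breaks = np.where(np.diff(a) != 1)[0] + 1  # positions where a gap starts
runs = np.split(a, breaks)
print(runs)  # [array([2, 3, 4]), array([ 9, 10]), array([15])]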
Example No. 12
    def __init__(self, classifier_path=None, ner_path=None, sutime_jar_path=None):
        # Change the paths according to your system; raw strings avoid
        # accidental escape sequences in Windows paths
        if classifier_path is None:
            classifier_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.muc.7class.distsim.crf.ser.gz"

        if ner_path is None:
            ner_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\stanford-ner.jar"

        if sutime_jar_path is None:
            sutime_jar_path = r"C:\stanford_corenlp\stanford-corenlp-full-2018-02-27\stanford-corenlp-full-2018-02-27"

        self.stanford_classifier = classifier_path
        self.stanford_ner_path = ner_path
        self.sutime_path = sutime_jar_path

        # Creating Tagger Object
        self.st = StanfordNERTagger(self.stanford_classifier, self.stanford_ner_path)
        self.su = SUTime(jars=self.sutime_path, mark_time_ranges=True, include_range=True)

        self.weather_terms = ["weather", "climate", "precipitation", "sun", "rain", "cloud","snow", "hot", "humid", "cold", "sunny", "windy","cloudy",
                              "rainy", "snowy", "misty", "foggy", "colder","hotter", "warmer", "pleasant"]
        self.greet_terms= ["hello","hey","howdy","hello","hi", "yo", "yaw"]
        self.closure_terms = ["no", "nope", "thank you", "bye", "tata", "thanks", "that will be all", "that's it", "that'll be all"]
        self.day_terms = ["dawn", "dusk", "morning", "evening", "noon","afternoon", "night", "tonight", "midnight", "midday"] #, "hours"]
        self.date_terms = ["today", "tomorrow", "yesterday"]
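A short usage sketch for this wrapper; WeatherBot is a placeholder for whatever class declares the __init__ above, and the outputs assume the standard nltk tag() and sutime parse() APIs:

# Hypothetical usage; the classifier/jar paths must point at real installs.
bot = WeatherBot()
print(bot.st.tag("What is the weather in Boston tomorrow".split()))
# -> [('What', 'O'), ..., ('Boston', 'LOCATION'), ('tomorrow', 'O')]
print(bot.su.parse("tomorrow morning"))
# -> [{'type': 'TIME', 'text': 'tomorrow morning', ...}]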
Example No. 13
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file,
                        output_file):
    # NOTE: the paths built from the arguments below are overridden by the
    # hardcoded classifier/jar paths that follow.
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile

    #st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    stanford_classifier = 'D:\\NLP\\ner_evals\\classifiers\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = 'D:\\NLP\\ner_evals\\classifiers\\stanford-ner-2018-02-27\\stanford-ner.jar'

    # Creating Tagger Object
    st = StanfordNERTagger(stanford_classifier,
                           stanford_ner_path,
                           encoding='utf-8')

    i = 0
    tagged_ne = []
    with open(tag_this_file, "r") as f:
        for line in f:
            line = line.split()
            i += 1
            if len(line) > 0:
                tagged_ne.append(line[0])
            else:
                # Remove the SENENDs from the output file afterwards; needed to keep the format consistent.
                # Keep in mind that some "/" are still removed; they are replaced in a postprocessing step.
                tagged_ne.append("SENEND")
    print(tagged_ne)

    # Tag the file using Stanford NER
    out = st.tag(tagged_ne)

    # Write the results to a tsv file
    with open(output_file, "w") as f:
        for i in out:
            f.write(str(i[0]) + "\t" + i[1] + "\n")
Example No. 14
def Name_Entity_recognition2(f2):
    st = StanfordNERTagger(os.environ['STANFORD_CLASSIFIER'],
                           os.environ['STANFORD_NER_PATH'],
                           encoding='utf-8')
    st.java_options = '-mx1000m'  # raise the JVM heap for large inputs
    word = word_tokenize(f2)
    classified_text = st.tag(word)
    return classified_text
Example No. 15
def o_tag():
    """Returns a noun with a tag if the tag is unfindable or a location"""
    sttag = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    sttags = sttag.tag(get_nouns())
    return [
        sttag for sttag in sttags if sttag[1] == 'O' or sttag[1] == "LOCATION"
    ]
Example No. 16
def NERTag(text):
    os.environ['CLASSPATH'] = "C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = "C:/Users/1/James/stanford-parser-full-2015-12-09"
    os.environ['JAVAHOME'] = "C:/Program Files/Java/jdk1.8.0_102"
    ner = StanfordNERTagger('C:/Users/1/James/stanford-ner-2015-12-09/classifiers/'
                            'english.all.3class.distsim.crf.ser.gz',
                   'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')
    # tag() expects a token list; passing a raw string would tag it
    # character by character
    r = ner.tag(word_tokenize(text))
    return r
Example No. 17
def namedEntityMatch(row):
    #os.environ["JAVA_HOME"]="C:\Program Files\Java\jdk1.8.0_151"
    if "JAVA_HOME" not in os.environ:
        print(
            "Please set the JAVA_HOME environment variable, or install Java on your machine"
        )
        sys.exit(-1)

    ner = StanfordNERTagger(
        r"stanford-ner-2014-06-16\classifiers\english.all.3class.distsim.crf.ser.gz",
        r"stanford-ner-2014-06-16\stanford-ner.jar")

    ques1Entities = ner.tag(str(row['question1']).lower().split())
    ques2Entities = ner.tag(str(row['question2']).lower().split())
    entityDict1 = {}
    entityDict2 = {}
    # group each question's entities by tag; note the tag to skip is the
    # letter 'O', not the digit '0', and we iterate the tagger output,
    # not the still-empty dicts
    for entity in ques1Entities:
        if entity[1] != "O":
            if entity[1] in entityDict1:
                entityDict1[entity[1]].append(entity[0])
            else:
                entityDict1[entity[1]] = [entity[0]]

    for entity in ques2Entities:
        if entity[1] != "O":
            if entity[1] in entityDict2:
                entityDict2[entity[1]].append(entity[0])
            else:
                entityDict2[entity[1]] = [entity[0]]

    if len(entityDict1) == 0 or len(entityDict2) == 0:
        return 0

    totalCount = 0
    matchCount = 0
    for key in entityDict1:
        entityList1 = entityDict1[key]
        if key in entityDict2:
            entityList2 = entityDict2[key]
            for item in entityList1:
                if item in entityList2:
                    matchCount += 1
                totalCount += 1
    for key in entityDict2:
        entityList2 = entityDict2[key]
        if key in entityDict1:
            entityList1 = entityDict1[key]
            for item in entityList2:
                if item in entityList1:
                    matchCount += 1
                totalCount += 1

    return float(matchCount) / float(totalCount)
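A quick usage sketch for the matcher, with a made-up row (any mapping with question1/question2 keys works):

# Illustrative input; a pandas DataFrame row would work the same way.
row = {
    'question1': "When did Google acquire YouTube?",
    'question2': "In which year did Google buy YouTube?",
}
print(namedEntityMatch(row))  # fraction of matching named entities, 0.0 to 1.0

Note that both questions are lowercased before tagging, which a case-sensitive CRF model tends to handle poorly; a caseless model would fit this approach better.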
Example No. 18
def NERFunc(file_path,fileName,G):
    os.environ["STANFORD_MODELS"] = os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20")
    st = StanfordNERTagger(os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20","classifiers","english.all.3class.distsim.crf.ser.gz"),
                          os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20","stanford-ner.jar" ))
    #print "Java error path JAVAHOME=" + str(os.environ["JAVAHOME"]) 
    #print "Java error path JAVA_HOME=" + str(os.environ["JAVA_HOME"])
    for index, fpath in enumerate(file_path):  # renamed so the loop variable doesn't shadow the parameter
        node_name = fileName[index]
        data = readFromFile(fpath)
        if data is not None:
                netagged_words = st.tag(data.split())
                for tag, chunk in groupby(netagged_words, lambda x:x[1]):
                    if tag != "O":
                        entity = " ".join(w for w, t in chunk)
                        if  entity != "":
                            entity = entity.encode('utf-8')
                            entity = re.sub(r'[^\x00-\x7F]+', ' ', entity)
                            entity = entity.lower()
                            no_punctuation = entity.translate(None, string.punctuation)
                            entity=re.sub("[^a-zA-Z]+", " ", no_punctuation)
                            #print("Tag = "+ tag+" entity = "+ entity)
                            #If this topic doesn't exist as a node then add it
                            if entity not in G.nodes():
                                G.add_node(entity)
                                G.node[entity]['type'] = graphConstants.TYPE_NER
                            #If the edge between this doc and entity is already present or not
                            if G.has_edge(node_name,entity) is False:
                                G.add_edge(node_name,entity, weight = 1)
                            else:
                                G[node_name][entity]["weight"] = G[node_name][entity]["weight"] + 1
                                graphUtils.logger.info("entity topic entity = "+entity + " document ="+node_name)
                            if G.has_edge(entity,node_name) is False:
                                G.add_edge(entity,node_name, weight = 1)
                            else:
                                G[entity][node_name]["weight"] = G[entity][node_name]["weight"] + 1
                            
                            topics = entity.split()
                            if(len(topics) > 1):
                                for word in entity.split():
                                    #Only change weight if this topic already exists
                                    if word in G.nodes():
                                        #If the edge between this doc and topic is already present or not
                                        if G.has_edge(node_name,word) is False:
                                            G.add_edge(node_name,word, weight = 1)
                                        else:
                                            G[node_name][word]["weight"] = G[node_name][word]["weight"] + 1
                                        if G.has_edge(word,node_name) is False:
                                            G.add_edge(word,node_name, weight = 1)
                                        else:
                                            G[word][node_name]["weight"] = G[word][node_name]["weight"] + 1
                                        graphUtils.logger.info("entity topic word = "+word + " document ="+node_name)
    return G
Example No. 19
    def __init__(self, cslm, transitions, tags):
        self.cslm = cslm
        self.transitions = transitions
        self.tags = tags
        self.engClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")
        self.spanClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/spanish.ancora.distsim.s512.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")
Example No. 20
def NERWithOldStanford(input_sample):
    java_path = r"C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"  # or e.g. r"C:\Program Files\Java\jdk1.8.0_161\bin\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar',
                               encoding='utf-8')
    tokenized_text = word_tokenize(input_sample)
    classified_paragraphs_list = tagger.tag_sents([tokenized_text])
    formatted_result = formatted_entities(classified_paragraphs_list)
    return formatted_result
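formatted_entities is defined elsewhere in that project; a minimal sketch of what such a helper might do, grouping classified tokens by entity class (the exact return shape of the original is unknown):

def formatted_entities(classified_paragraphs_list):
    """Hypothetical helper: collect tokens per entity class across paragraphs."""
    entities = {'PERSON': [], 'ORGANIZATION': [], 'LOCATION': []}
    for paragraph in classified_paragraphs_list:
        for token, tag in paragraph:
            if tag in entities:
                entities[tag].append(token)
    return entities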
Example No. 22
def nermodel(jsondata):

    # download Stanford NER from https://nlp.stanford.edu/software/CRF-NER.html#Download
    # and point the jar argument at stanford-ner.jar inside the unpacked directory
    ner_tagger = StanfordNERTagger(
        "/home/ubuntu/Documents/node-python/simple-express/python_scripts/bcm-model.ser.gz",
        "/home/ubuntu/Documents/stanford-ner-2018-10-16/stanford-ner.jar",
        encoding='utf8')
    result = ner_tagger.tag(jsondata.split())
    print(json.dumps({"output": result}))
Example No. 23
def get_ner_tags(answer):

    # Download the Stanford NER zip file and extract it,
    # then change the directory location in the following path.
    st = StanfordNERTagger(
        '/Users/shubhambarhate/Desktop/project3/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/shubhambarhate/Desktop/project3/stanford-ner-2017-06-09/stanford-ner.jar'
    )

    # NOTE: this tags the module-level context_list, not the answer parameter
    return st.tag(context_list[0].split())
Example No. 24
def ner_tag():
    """Returns a noun with a tag if the tag is person or organization"""
    sttag = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    sttags = sttag.tag(get_nouns())
    return [
        sttag for sttag in sttags
        if sttag[1] == "PERSON" or sttag[1] == "ORGANIZATION"
    ]
Example No. 25
def NERFunc(data, G, node_name):
    os.environ["STANFORD_MODELS"] = os.path.join(graphConstants.ROOT_FOLDER,
                                                 "stanford-ner-2015-04-20")
    st = StanfordNERTagger(
        os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20",
                     "classifiers", "english.all.3class.distsim.crf.ser.gz"),
        os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20",
                     "stanford-ner.jar"))
    if data is not None:
        for sentence in nltk.sent_tokenize(data):
            netagged_words = st.tag(sentence.split())
            for tag, chunk in groupby(netagged_words, lambda x: x[1]):
                if tag != "O":
                    entity = " ".join(w for w, t in chunk)
                    if entity != "":
                        entity = entity.encode('utf-8')
                        entity = re.sub(r'[^\x00-\x7F]+', ' ', entity)
                        entity = entity.lower()
                        no_punctuation = entity.translate(
                            None, string.punctuation)
                        entity = re.sub("[^a-zA-Z]+", " ", no_punctuation)
                        #print("Tag = "+ tag+" entity = "+ entity)
                        # Skip entities that don't already exist as nodes
                        if entity not in G.nodes():
                            continue
                        #If the edge between this doc and entity is already present or not
                        if G.has_edge(node_name, entity) is False:
                            G.add_edge(node_name, entity, weight=1)
                        else:
                            G[node_name][entity][
                                "weight"] = G[node_name][entity]["weight"] + 1
                            print "Recomm entity = " + entity + " document =" + node_name
                        if G.has_edge(entity, node_name) is False:
                            G.add_edge(entity, node_name, weight=1)
                        else:
                            G[entity][node_name][
                                "weight"] = G[entity][node_name]["weight"] + 1

                        topics = entity.split()
                        if (len(topics) > 1):
                            for word in entity.split():
                                #Only change weight if this topic already exists
                                if word in G.nodes():
                                    #If the edge between this doc and topic is already present or not
                                    if G.has_edge(node_name, word) is False:
                                        G.add_edge(node_name, word, weight=1)
                                    else:
                                        G[node_name][word]["weight"] = G[
                                            node_name][word]["weight"] + 1
                                    if G.has_edge(word, node_name) is False:
                                        G.add_edge(word, node_name, weight=1)
                                    else:
                                        G[word][node_name]["weight"] = G[word][
                                            node_name]["weight"] + 1
                                    print "Recomm entity topic word = " + word + " document =" + node_name
Example No. 26
    def __init__(self):
        """
        Create the Stanford NERTagger client.
        """

        ser_path = get_project_path() + '/nltk_libs/english.all.3class.distsim.crf.ser'
        jar_path = get_project_path() + '/nltk_libs/stanford-ner-3.8.0.jar'

        self.st = StanfordNERTagger(ser_path, jar_path)
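get_project_path is also defined elsewhere in that project; a plausible sketch, assuming the project root sits one directory above this module:

import os

def get_project_path():
    # assumption: the project root is the parent of this file's directory
    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))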
Example No. 27
def get_NER_Tagger(content):
    NER_classifier = "/Users/aparnaghosh87/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz"
    os.environ['CLASSPATH'] = "/Users/aparnaghosh87/Downloads/stanford-ner-2014-06-16"
    st = StanfordNERTagger(NER_classifier, encoding='utf-8')
    tokenized_text = word_tokenize(content)
    classified_text = st.tag(tokenized_text)
    # output looks like this:
    # [('While', 'O'), ('in', 'O'), ('France', 'LOCATION'), (',', 'O'), ('Christine', 'PERSON'), ('Lagarde', 'PERSON'), ('discussed', 'O'), ('short-term', 'O'), ('stimulus', 'O'), ('efforts', 'O'), ('in', 'O'), ('a', 'O'), ('recent', 'O'),
    # ('interview', 'O'), ('with', 'O'), ('the', 'O'), ('Wall', 'O'), ('Street', 'O'), ('Journal', 'O'), ('.', 'O')]
    return classified_text
Example No. 28
def getTheNamedEntities(text):
    st = StanfordNERTagger(StanfordNERClassifierPath, StanfordNERjarPath)

    lstTag = st.tag(text.split())

    result = {}
    for tag in lstTag:
        # if str(tag[1]).lower() != 'o':  # CAUTION - would drop the 'O' (outside any entity) items
        result[str(tag[0]).replace('.', '')] = str(tag[1])

    return result
Example No. 29
    def __init__(self, penalty: float, threshold: float):
        self.ner = StanfordNERTagger(
            'libs/english.all.3class.distsim.crf.ser.gz',
            'libs/stanford-ner-3.9.1.jar')

        path_to_jar = 'libs/stanford-corenlp-3.9.1.jar'
        path_to_models_jar = 'libs/stanford-corenlp-3.9.1-models.jar'
        self.dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        self.penalty = penalty
        self.threshold = threshold
Example No. 30
def compute_NER(corpus):
    NER = []
    st = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner.jar')
    ner = st.tag(corpus.split())
    ner_tag = ""
    for n in ner:
        ner_tag = ner_tag + n[1] + " "
    NER.append(ner_tag)
    return NER
Example No. 31
def main():

    # parse all the command line arguments
    args = parser.parse_args()
    args_is_tag = args.tag
    args_is_word = args.word

    # validate the path passed in the argument
    if not args_is_tag:
        parser.error("--tag is missing")
    else:
        tagger = StanfordNERTagger(
            '/Users/Shared/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/Users/Shared/stanford-ner/stanford-ner.jar',
            encoding='utf-8')

        news_corpus = [
            'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.',
            "Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, and the Apple TV digital media player. Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites. Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne on April 1, 1976, to develop and sell personal computers. It was incorporated as Apple Computer, Inc. on January 3, 1977, and was renamed as Apple Inc. on January 9, 2007, to reflect its shifted focus toward consumer electronics. Apple (NASDAQ: AAPL ) joined the Dow Jones Industrial Average on March 19, 2015.",
            "At least 100 security forces killed in fight for Afghan city",
            "Sonia Gandhi, Oscar Fernandes Move High Court In National Herald IT Case",
            "Musk talking to Saudi fund, others as he seeks Tesla buyout financing",
            "Volkswagen's Electrify America taps Flintstones, Jetsons for EV campaign",
            "Netflix finance chief David Wells to step down",
            "Independent labels urge EU to block Sony's $2.3 billion bid for EMI",
            "Samsung may suspend operations at China mobile phone plant - report",
            "Oil India's quarterly profit jumps 56 percent, but misses estimate",
            "SEBI proposes changes to consent settlement rules",
            "VF to spin off Lee and Wrangler jeans into public company",
            "Erdogan vows action against 'economic terrorists' over lira plunge",
            "Citigroup says global card chief Linville leaving in shakeup",
            "Tesla short sellers trim exposure but stay the course",
            "Facebook pages with large U.S. following to require more authorization"
            "Hackers at convention test voting systems for bugs"
        ]

        news_dict = {}
        for i, each_news in enumerate(news_corpus):
            tokenized_list = word_tokenize(each_news)
            news_dict[i] = formatted_entities_for_tag(
                tagger.tag(tokenized_list))

        if args_is_word:
            search_news_key_ls = keyword_search(args_is_tag, args_is_word,
                                                news_corpus, news_dict, tagger)
        else:
            search_news_key_ls = tag_search(args_is_tag, news_dict)

        search_news_ls = []
        for each_key in search_news_key_ls:
            search_news_ls.append(news_corpus[each_key])

        news_df = pd.DataFrame({'News': search_news_ls})
        news_df.to_csv('News.csv', index=False)
Example No. 32
def perpIndividual(inFile):
    # NOTE: the inFile argument is unused; the sample text below is hardcoded
    st = StanfordNERTagger(
        r'D:\PythonProjects\StanfordParser\stanford-ner-2017-06-09\classifiers\english.all.3class.distsim.crf.ser.gz',
        r'D:\PythonProjects\StanfordParser\stanford-ner-2017-06-09\stanford-ner.jar',
        encoding='utf-8')
    text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    print(classified_text)
Example No. 33
def ner_tag(url):
    text = nltk_toy(url)
    if text:
        #unicodedata.normalize('NFKD', text).encode('ascii','ignore')
        # NOTE: sentences/words/tokens below are computed but never used;
        # the tagging loop re-splits the raw text itself
        sentences = nltk.sent_tokenize(text)
        words = [nltk.word_tokenize(sentence) for sentence in sentences]
        tokens = [nltk.pos_tag(word) for word in words]
        ner = StanfordNERTagger('ner/english.muc.7class.distsim.crf.ser.gz','ner/stanford-ner.jar')
        thing = []
        for sent in re.split(r'\. |! |\? ', text):  # split at sentence boundaries
            tag = ner.tag(sent.split())             # tag each sentence
            for (x, i) in tag:                      # for all of the NER tags
                if i != 'O':                        # keep only actual entities
                    thing.append((x, i))            # keep track
        print(thing)
        return thing
    return None
Example No. 34
class NamedEntityTagger(object):
    """ Performs NER against a given document"""

    def __init__(self):
        self.tagger = StanfordNERTagger('/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz', '/stanford_ner/stanford-ner.jar')

    def perform_ner(self, text):
        # tag() expects a token list; pass word_tokenize(text), not a raw string
        return self.tagger.tag(text)
Example No. 35
    def named_entity_extraction(self):
        # normalize punctuation so the whitespace split yields clean tokens
        plaintext = (self.metadata["plaintext"].replace(".", " ")
                     .replace(",", " , ").replace("!", " ")
                     .replace("?", " ").replace("\n", " "))
        try:
            ner = StanfordNERTagger('../lib/stanford-lib/english.all.3class.distsim.crf.ser.gz',
                                    '../lib/stanford-lib/stanford-ner.jar')
            extracted_ne = ner.tag(plaintext.split())
        except Exception:
            # fall back to the older NLTK NERTagger API, whose tag() returns a
            # list of sentences rather than a flat token list
            ner = NERTagger('../lib/stanford-lib/english.all.3class.distsim.crf.ser.gz',
                            '../lib/stanford-lib/stanford-ner.jar')
            extracted_ne = ner.tag(plaintext.split())[0]

        persons = self.process_named_entities(extracted_ne, "PERSON")
        organizations = self.process_named_entities(extracted_ne, "ORGANIZATION")
        locations = self.unify_locations(extracted_ne)
        
        self.metadata["persons"] = persons
        self.metadata["organizations"] = organizations
        self.metadata["locations"] = locations

        general_locations = self.enrich_location(locations)
        self.metadata["countries"] = general_locations[0]   # a list of countries
        self.metadata["places"] = general_locations[1]      # a list of places
Example No. 36
def named_entities(sentence):
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    tags = st.tag(word_tokenize(sentence))
    print tags
    # clean up the result from the tagger
    prev_tag_name = str(tags[0][1])
    cur_entity = str(tags[0][0])
    entities = {}
    for i in range(1, len(tags)):
        cur_tag = tags[i]
        cur_token = str(cur_tag[0])
        cur_tag_name = str(cur_tag[1])
        if cur_tag_name == prev_tag_name:
            cur_entity = cur_entity + " " + cur_token
        else:
            if prev_tag_name not in entities:
                entities[prev_tag_name] = []
            # change encoding, another way is to .encode('ascii','ignore')
            entities[prev_tag_name].append(str(cur_entity))
            cur_entity = cur_token
        prev_tag_name = cur_tag_name
    # flush the final entity, which the loop above never appends
    entities.setdefault(prev_tag_name, []).append(cur_entity)
    if 'O' in entities:
        del entities['O']  # not needed, 'O' means not a named entity
    return entities
Example No. 37
def main():

	file_paths = []

	for root, directories, files in os.walk("training"):
		for filename in files:
			if filename == "en.tok.off.pos":
				filepath = os.path.join(root, filename)
				file_paths.append(filepath) 

	classifier = "stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz"
	jar = "stanford-ner-2014-06-16/stanford-ner-3.4.jar"
	NERTagger = StanfordNERTagger(classifier, jar)
	for filesource in file_paths:
		words, lines = rawtext(filesource)
		NERlist = NERTagger.tag(words)
		LOClist, TAG1list = LOC_ORG_PERtagger(NERlist)
		TAG2list = CIT_COUtagger(LOClist)
		TAG3list = ANI_SPO_NAT_ENTtagger(words)
		
		ALLlist = [TAG1list,TAG2list,TAG3list]

		writeout(ALLlist, filesource, lines)
		
		NNPlist = read_data(filesource)
		#print(filesource," ",NNPlist)
		FINAL_NNPlist = nnp_checker(NNPlist)
		#print(FINAL_NNPlist,"\n")

		for nnp in FINAL_NNPlist:
			ngram = nnp[-1]
			link = link_checker(ngram)
			if link != -1:
				nnp.append(link)
		wiki_writeout(FINAL_NNPlist, lines, filesource)		
Example No. 38
    def __init__(self, rawQueryFile, contentFile):
        self.rawQuery = ''
        self.content = ''
        self.questionTypeWH = ["how many", "who", "what", "where", "when", "why", "which", "how"]
        self.questionTypeFactoid1 = ["do", "did", "does"]
        self.questionTypeFactoid2 = ["is", "are", "has", "have", "had", "was", "were", "would", "will", "should", "can", "could"]
        self.questionTypeOther = ["how", "list", "describe"]
        self.stopWords = ("the","a","an","am","of","by","at","be","on","or","any","in","to","as","its","it")
        self.negationWords = ("none", "not", "no", "can't", "couldn't", "don't", "won't","neither","nobody","nowhere","nothing")
        self.allTypes = ("WHType", "YesNo", "List", "None")
        self.cur_dir = os.getcwd()
        self.NERTaggerObj = StanfordNERTagger(self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/stanford-ner.jar')
        self.qWord = ""
        self.Initialize()
        self.Main()
Example No. 39
class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.stanford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        with open(self.filepath, 'r') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    phone_numbers.extend(self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))

        for title, tag in self.stanford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)

        return {'people': people, 'locations': locations, 'organizations': organizations,
                'emails': emails, 'phone_numbers': phone_numbers, 'street_addresses': street_addresses,
                'credit_cards': credit_cards, 'ips': ips
                }
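A short usage sketch, assuming a CSV file on disk (the file name is a placeholder):

# Hypothetical usage of the analyzer above.
analyzer = PiiAnalyzer('contacts.csv')
report = analyzer.analysis()
print(report['people'])         # names found by the NER tagger
print(report['phone_numbers'])  # numbers found by the regex parser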
Example No. 40
# the path where you have downloaded and unzipped the NER tagger.
sp_dir = '/home/sarah/nertagger/'
model1 = sp_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
model2 = sp_dir + 'classifiers/english.conll.4class.distsim.crf.ser.gz'
model3 = sp_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
jar_path = sp_dir + 'stanford-ner.jar'

#our test sentence

eng_sent = 'Rami Eid has been studying at Stony Brook University in NY since 2007. He pays $30 daily'
print eng_sent
eng_tokens = word_tokenize(eng_sent)
#for 3 classes-Location, Person, Organization
print "\n\n 3 classes"
st_3 = StanfordNERTagger(model_filename = model1, path_to_jar = jar_path)
eng_tagged = st_3.tag(eng_tokens)
for i in eng_tagged:
	print i
#for 4 classes-Location, Person, Organization, Misc
print "\n\n 4 classes"
st_4 = StanfordNERTagger(model_filename = model2, path_to_jar = jar_path)
eng_tagged = st_4.tag(eng_tokens)
for i in eng_tagged:
	print i

#for 7 classes-Time, Location, Organization, Person, Money, Percent, Date 
print "\n\n 7 classes"
st_7 = StanfordNERTagger(model_filename = model3, path_to_jar = jar_path)
eng_tagged = st_7.tag(eng_tokens)
for i in eng_tagged:
	print i
Example No. 41
def ner_tag():
    """Returns a noun with a tag if the tag is person or organization"""
    sttag = StanfordNERTagger('stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    sttags = sttag.tag(get_nouns())
    return [sttag for sttag in sttags if sttag[1] == "PERSON" or sttag[1] == "ORGANIZATION"]
Example No. 42
class AnsweringModule:
    def __init__(self, rawQueryFile, contentFile):
        self.rawQueryFile = rawQueryFile
        self.contentFile = contentFile
        self.rawQuery = ''
        self.content = ''
        self.questionTypeWH = ["how many", "who", "what", "where", "when", "why", "which", "how"]
        self.questionTypeFactoid1 = ["do", "did", "does"]
        self.questionTypeFactoid2 = ["is", "are", "has", "have", "had", "was", "were", "would", "will", "should", "can", "could"]
        self.questionTypeOther = ["how", "list", "describe"]
        self.stopWords = ("the","a","an","am","of","by","at","be","on","or","any","in","to","as","its","it")
        self.negationWords = ("none", "not", "no", "can't", "couldn't", "don't", "won't","neither","nobody","nowhere","nothing")
        self.allTypes = ("WHType", "YesNo", "List", "None")
        self.cur_dir = os.getcwd()
        self.NERTaggerObj = StanfordNERTagger(self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/stanford-ner.jar')
        self.qWord = ""
        self.Initialize()
        self.Main()

    def Initialize(self):
        # STEP 0: Convert HTML to Raw text using Beautiful Soup
        self.rawQuery = convertHTMLtoRawText(self.rawQueryFile)
        self.content = convertHTMLtoRawText(self.contentFile)
        # Replace EOL character
        self.rawQuery = self.rawQuery.strip().replace("?","").replace('"','').replace(".", " ")
        self.campusLocations = {}

    # Removes the stopwords and the qWord from the question
    def ReplaceStopQuestionWords(self, question):
        for stopWord in self.stopWords:
            pattern = r'\b%s\b'% stopWord
            question= re.sub(pattern,"",question,flags=re.IGNORECASE)

        pattern = r'\b%s\b' % self.qWord
        questionWithOutJunk = re.sub(pattern,"",question,flags = re.IGNORECASE)
        
        questionWithOutJunk = questionWithOutJunk.replace('?', '')
        
        return questionWithOutJunk

    # Given the input question, this method returns the qWord and the type of question
    def DefineQuestionType(self, question):
        questionLC = question.lower()
        # Check first word in sentence
        wordsInSentence = questionLC.split()
        if wordsInSentence[0] in self.questionTypeWH:
            # Check for How Many type question
            if wordsInSentence[0]+" "+wordsInSentence[1] in self.questionTypeWH:
                return self.allTypes[0], "how many"
            # All other WH Questions
            else:
                return self.allTypes[0], wordsInSentence[0]

        elif wordsInSentence[0] in self.questionTypeFactoid2:
            return self.allTypes[1], wordsInSentence[0]

        elif wordsInSentence[0] in self.questionTypeFactoid1:
            return self.allTypes[1], wordsInSentence[0]

        elif wordsInSentence[0] in self.questionTypeOther:
            return self.allTypes[2], wordsInSentence[0]
        
        else:
            # For complex sentences, check for question words after comma
            if "," in questionLC:
                wordsInSentence = questionLC.split(",")[1].split()
                if wordsInSentence[0] in self.questionTypeWH:
                    # Check for How many
                    if wordsInSentence[0]+" "+wordsInSentence[1] in self.questionTypeWH:
                        return self.allTypes[0], "how many"
                    # All other WH Questions
                    else:
                        return self.allTypes[0], wordsInSentence[0]

                elif wordsInSentence[0] in self.questionTypeFactoid2:
                    return self.allTypes[1], wordsInSentence[0]

                elif wordsInSentence[0] in self.questionTypeFactoid1:
                    return self.allTypes[1], wordsInSentence[0]

                elif wordsInSentence[0] in self.questionTypeOther:
                    return self.allTypes[2], wordsInSentence[0]
                else:
                    return self.allTypes[3], ""
            #As a last resort, look for question word in the entire question sentence
            #We ignore edge cases where there are multiple question words
            else: 
                for q in self.questionTypeWH: 
                    if q in questionLC: return self.allTypes[0], q
                for q in self.questionTypeFactoid2: 
                    if q in questionLC: return self.allTypes[1], q
                for q in self.questionTypeFactoid1: 
                    if q in questionLC: return self.allTypes[1], q
                for q in self.questionTypeOther: 
                    if q in questionLC: return self.allTypes[2], q
                return self.allTypes[3], ""

    # Given a sentence, checks whether it contains any GHC locations
    def CheckForWHEREAnswer(self, sentence):
        # iterate over tokens, not characters
        for token in sentence.split():
            if token.lower() in self.campusLocations:
                return True
        sentTokens = nltk.word_tokenize(sentence)
        NERtags = self.NERTaggerObj.tag(sentTokens)
        NERtags = chunkNEROutput(NERtags)
        countOfOccurence = 0
        for i in xrange(0,len(NERtags)):
            if 'LOCATION' in NERtags[i]:
                ans, tag = NERtags[i]
                if ans in sentence:
                    countOfOccurence += 1
                    continue
                return True
        if countOfOccurence > 1:
            return True
        return None

    # Given a sentence, checks whether it contains time stamps
    def CheckForWHENAnswer(self, sentence):
        timeStamp = getTimeStamp(sentence)
        if timeStamp is not None:
            return True
        # Covered Edge case for "second century AD"
        for timeEvent in {"AD", "BCE", "BC"}:
            pattern = r'[^a-zA-Z]%s[^a-zA-Z]' % timeEvent
            if re.search(pattern, sentence) is not None:
                return True
        return None


    # Given the content and question, this method extracts the matching sentences
    def CheckForMatch(self, content, question):
        result = ''

        # Tokenize the question
        queryTokens, queryPosTags, queryMorphTokens = tokenize(question)
        lengthOfQuery = len(queryMorphTokens)
        
        # Split the entire content into Sentences
        contentSentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\n)\s', content)

        for idx, sentence_level1 in enumerate(contentSentences):
            for sentence in sentence_level1.split("\n"):
                sentence = sentence.replace("<p>","").replace("</p>","").replace(".","")
                sentence_processed = sentence
                sentence_processed = sentence_processed.replace(". ","").replace("<a href"," <a href")
                sentTokens, sentPosTags, sentMorphTokens = tokenize(sentence_processed)
                counter = 0.0
                for word in queryMorphTokens:
                    # Get list of synonyms of the query word
                    synonyms = synonyms_as_set(word, queryTokens[queryMorphTokens.index(word)])

                    # For each word in the query, we check if the word occurs in the sentence
                    if word in sentMorphTokens:
                        counter+=1.0
                    else: # OR check if the word is a synonym of a word in the sentence i.e. one of the synonyms exist in the sentence
                        for syn in synonyms:
                            if syn in sentMorphTokens:
                                counter+=1.0
                                break

                matchPercent = float(counter/lengthOfQuery)
                if matchPercent >= 0.4:
                    # ADDITIONAL CHECKS:
                    # WHERE TYPE QUESTIONS SHOULD HAVE LOCATION IN SENTENCE:
                    if self.qWord == "where":
                        if self.CheckForWHEREAnswer(sentence_processed) == None:
                            # Check for neighboring sentences to extract the location
                            if (idx+1) <= len(contentSentences)-1:
                                fwdSentence = contentSentences[idx+1]
                                if self.CheckForWHEREAnswer(fwdSentence) == True:
                                    sentence = sentence + "\n" + fwdSentence
                            if (idx-1) >= 0:
                                bckSentence = contentSentences[idx-1]
                                if self.CheckForWHEREAnswer(bckSentence) == True:
                                    sentence = bckSentence + "\n" + sentence

                    # WHEN TYPE QUESTIONS SHOULD HAVE TIME IN SENTENCE:
                    if self.qWord == "when":
                        if self.CheckForWHENAnswer(sentence_processed) == None:
                            # If the current sentence doesn't have a date. We check for previous sentence
                            if (idx+1) <= len(contentSentences)-1:
                                fwdSentence = contentSentences[idx+1]
                                if self.CheckForWHENAnswer(fwdSentence) == True:
                                    sentence = sentence + "\n" + fwdSentence
                            if (idx-1) >= 0:
                                bckSentence = contentSentences[idx-1]
                                if self.CheckForWHENAnswer(bckSentence) == True:
                                    sentence = bckSentence + "\n" + sentence

                    result = result + sentence + "\n"
        print result

    def Main(self):
        # STEP 1: Define Question Type: 
        qType, self.qWord = self.DefineQuestionType(self.rawQuery)

        if self.qWord == "where":
            self.campusLocations = getCampusLocation(self.cur_dir+"/PythonScripts/gazeteer/campusLocations.txt")

        # STEP 2: Remove stop words and junk from the question
        questionWithOutJunk = self.ReplaceStopQuestionWords(self.rawQuery)

        # STEP 3:
        self.CheckForMatch(self.content, questionWithOutJunk)
Example No. 43
import wikipedia as wiki
from bs4 import BeautifulSoup
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from datetime import datetime

from indexers import TextCollection, TextGroup

# Path to java
jpath2 = 'C:/Program Files (x86)/Java/jre1.8.0_73/bin'
# Setting Java environment
os.environ['JAVAHOME'] = jpath2

# initializing stanford NER
st = StanfordNERTagger(r'C:\stanford-ner-2014-06-16\classifiers\english.muc.7class.distsim.crf.ser.gz',
                       r'C:\stanford-ner-2014-06-16\stanford-ner.jar', encoding='UTF-8')


def clean(in_txt: str) -> str:
    in_txt = re.sub("/.*/; ", "", in_txt)
    in_txt = re.sub("–", "-", in_txt)
    in_txt = re.sub(r"\\", "", in_txt)
    in_txt = re.sub("Â\xa0", " ", in_txt)
    in_txt = unidecode(in_txt)
    indices = [m.start() for m in re.finditer('(\d{4}(-|/)\d{2})', in_txt)]
    for i in reversed(indices):
        in_txt = in_txt[0:(i + 4)] + " to " + in_txt[i:i + 2] + in_txt[i + 5:]

    indices = [m.start() for m in re.finditer(r'\d{4}(-|/)\d{4}', in_txt)]
    for i in reversed(indices):
        in_txt = in_txt[:i + 4] + ' to ' + in_txt[i + 5:]
Example No. 44
def entitiy_rec(text):
    ner = StanfordNERTagger("ner/classifiers/english.all.3class.distsim.crf.ser.gz", "ner/stanford-ner.jar")
    # split into tokens first; tag() would otherwise iterate a raw string character by character
    tags = ner.tag(text.split())
    return tags
	def process(self, parameters={}, data={}):
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True


		if 'stanford_ner_mapping' in parameters:
			mapping = parameters['stanford_ner_mapping']
		else:
			# todo: extend mapping for models with more classes like dates
			mapping = {
			 'PERSON': 'person_ss',
			 'LOCATION': 'location_ss',
			 'ORGANIZATION': 'organization_ss',
			 'I-ORG': 'organization_ss',
			 'I-PER': 'person_ss',
			 'I-LOC': 'location_ss',
			 'ORG': 'organization_ss',
			 'PER': 'person_ss',
			 'LOC': 'location_ss',
			 'PERS': 'person_ss',
			 'LUG': 'location_ss',
			 'MONEY': 'money_ss',
			}
	

		# default classifier
		classifier = 'english.all.3class.distsim.crf.ser.gz'

		if 'stanford_ner_classifier_default' in parameters:
			classifier = parameters['stanford_ner_classifier_default']

		# set language specific classifier, if configured and document language detected
		if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
			# is a language-specific classifier configured for the detected language?
			if data['language_s'] in parameters['stanford_ner_classifiers']:
				classifier = parameters['stanford_ner_classifiers'][data['language_s']]

		# if the default classifier is set to None and there is no classifier for the detected language, exit the plugin
		if not classifier:
			return parameters, data

		kwargs={}

		if 'stanford_ner_java_options' in parameters:
			kwargs['java_options'] = parameters['stanford_ner_java_options']

		if 'stanford_ner_path_to_jar' in parameters:
			kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']

		analyse_fields = ['title_txt','content_txt','description_txt','ocr_t','ocr_descew_t']

		text = ''
		for field in analyse_fields:
			if field in data:
				text = "{}{}\n".format(text, data[field])

		# classify/tag with class each word of the content
		st = StanfordNERTagger(classifier, encoding='utf8', verbose=verbose, **kwargs)
		entities = st.tag(text.split())

		# merge consecutive words of the same class into multi-word entities (i.e. split on class changes instead of on single words/tokens)
		entities = self.multi_word_entities(entities)

		# if class of entity is mapped to a facet/field, append the entity to this facet/field
		for entity, entity_class in entities:

			if entity_class in mapping:
				
				if verbose:
					print ( "NER classified word(s)/name {} to {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]) )

				etl.append(data, mapping[entity_class], entity)

			else:
				if verbose:
					print ( "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}".format(entity_class, entity) )


		# mark the document as already analyzed by this plugin
		data['enhance_ner_stanford_b'] = "true"
		
		return parameters, data
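
A hypothetical configuration for this plugin; the keys mirror the parameter names the method reads above, and the paths are placeholders:

parameters = {
    'stanford_ner_classifier_default': 'english.all.3class.distsim.crf.ser.gz',
    'stanford_ner_path_to_jar': '/opt/stanford-ner/stanford-ner.jar',  # placeholder install path
    'stanford_ner_java_options': '-mx1000m',
    'verbose': True,
}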
Example No. 46

class Evaluator:
    def __init__(self, cslm, transitions, tags):
        self.cslm = cslm
        self.transitions = transitions
        self.tags = tags
        self.engClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")
        self.spanClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/spanish.ancora.distsim.s512.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")

    def tagger(self, text_list):
        hmm = HiddenMarkovModel(text_list, self.tags, self.transitions, self.cslm)
        hmmtags = hmm.generateTags() # generate list of hmm tags
        words = hmm.words # generate list of words
        taggedTokens = []
        prevLang = "Eng"
        engTags = []
        spnTags = []
        engTag = ""
        spanTag = ""
        token = re.compile(ur'[^\w\s]', re.UNICODE)
        print "Tagging {} words".format(len(words))
        for k, word in enumerate(words):
            # check if punctuation else use hmmtag
            lang = 'Punct' if re.match(token, word) and not word[-1].isalpha() else hmmtags[k]
            lang = 'Num' if word.isdigit() else lang
            # refresh the 1000-token NER batch at every batch boundary, even on
            # punctuation, so later lookups never read a stale batch
            index = k % 1000
            if index == 0:
              engTags = self.engClassifier.tag(words[k:k+1000])
              spnTags = self.spanClassifier.tag(words[k:k+1000])
            # check if word is NE
            if lang != "Punct":
              engTag = engTags[index][1]
              spanTag = spnTags[index][1]
            else:
              engTag = "O"
              spanTag = "O"

            # mark as NE if either classifier identifies it
            if engTag != 'O' or spanTag != 'O':
                NE = "{}/{}".format(engTag, spanTag)
            else:
                NE = "O"
            # record probabilities
            if lang in ("Eng", "Spn"):
              hmmProb = round(hmm.transitions[prevLang][lang], 2)
              engProb = round(self.cslm.prob("Eng", word), 2)
              spnProb = round(self.cslm.prob("Spn", word), 2)
              totalProb = (hmmProb + engProb) if lang == "Eng" else (hmmProb + spnProb)
              prevLang = lang
            else:
              hmmProb = "N/A"
              engProb = "N/A"
              spnProb = "N/A"
              totalProb = "N/A"

            taggedTokens.append((word, lang, NE, str(engProb), str(spnProb), str(hmmProb), str(totalProb)))
            #taggedTokens.append((word, lang, NE))
            #print word, lang, NE
        return taggedTokens

    #  Tag testCorpus and write to output file
    def annotate(self, testCorpus):
        print "Annotation Mode"
        # note: str.strip('.txt') would strip characters, not the suffix
        with io.open(testCorpus.replace('.txt', '') + '_annotated.txt', 'w', encoding='utf8') as output:
            text = io.open(testCorpus).read()
            testWords = toWordsCaseSen(text)
            tagged_rows = self.tagger(testWords)
            output.write(u"Token\tLanguage\tNamed Entity\tEng-NGram Prob\tSpn-NGram Prob\tHMM Prob\tTotal Prob\n")
            for row in tagged_rows:
                csv_row = '\t'.join([unicode(s) for s in row]) + u"\n"
                print csv_row
                output.write(csv_row)
            print "Annotation file written"

    #  Evaluate goldStandard and write to output file
    def evaluate(self, goldStandard):
        print "Evaluation Mode"
        with io.open(goldStandard + '_outputwithHMM.txt', 'w', encoding='utf8') as output:

            #create list of text and tags
            lines = io.open(goldStandard, 'r', encoding='utf8').readlines()
            text, gold_tags = [], []
            for x in lines:
                columns = x.split("\t")
                text.append(columns[-2].strip())
                gold_tags.append(columns[-1].strip())


            # annotate text with model
            annotated_output = self.tagger(text)
            #tokens, lang_tags, NE_tags = map(list, zip(*annotated_output))
            tokens, lang_tags, NE_tags, engProbs, spnProbs, hmmProbs, totalProbs = map(list, zip(*annotated_output))

            # set counters to 0
            langCorrect = langTotal = NECorrect = NETotal = 0
            evaluations = []

            # compare gold standard and model tags
            for lang, NE, gold in zip(lang_tags, NE_tags, gold_tags):
                if gold in ('Eng', 'Spn'):   #evaluate language tags
                    langTotal += 1
                    if gold == lang:
                        langCorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # evaluate NE tags
                elif gold == "NamedEnt":
                    NETotal += 1
                    if NE != 'O':
                        NECorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # don't evaluate punctuation
                else:
                    evaluations.append("NA")
            #write
            output.write(u"Language Accuracy: {}\n".format(langCorrect / float(langTotal)))
            output.write(u"NE Accuracy: {}\n".format(NECorrect / float(NETotal)))
            output.write(u"Token\tGold Standard\tTagged Language\tNamed Entity\tEvaluation\n")
            for all_columns in zip(text, gold_tags, lang_tags, NE_tags, evaluations):
                output.write(u"\t".join(all_columns) + u"\n")
            print "Evaluation file written"
Example No. 47
 def __init__(self):
     self.tagger = StanfordNERTagger('/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz', '/stanford_ner/stanford-ner.jar')
Example No. 48
def ner(datasetfile, format, language):

    tweets = ""
    tweetids = []
    if language == 'english':
        st = StanfordNERTagger(BASEPATH+'/classifiers/english.all.3class.distsim.crf.ser.gz', BASEPATH+'/classifiers/stanford-ner.jar', encoding='utf8')
    elif language == 'spanish':
        st = StanfordNERTagger(BASEPATH+'/classifiers/spanish.ancora.distsim.s512.crf.ser.gz', BASEPATH+'/classifiers/stanford-ner.jar', encoding='utf8')

    if format == 'xml':

        dataset = etree.parse(datasetfile)
        for tweet in dataset.xpath('//Tweet'):
            tweetText = tweet.xpath('./TweetText/text()')[0]
            tweets += ' '.join(re.findall(r"[\w:/!#$%&*+,\-:;?@^_`{|}~.]+|[\"'()[\]<=>]", tweetText))+"\n"
            tweetids.append(tweet.xpath('./TweetId/text()')[0])

        tweets = tweets.encode('utf-8')

    elif format == "nif":

        tweetdict = {}
        a = rdflib.Graph()
        a.parse(datasetfile, format='n3')

        for s, p, o in a:
            if s.endswith(',') and p.endswith('isString'):
                tweetid = s.split('#')[0].split('.xml/')[1]
                tweetdict[tweetid] = ' '.join(re.findall(r"[\w:/!#$%&*+,\-:;?@^_`{|}~.]+|[\"'()[\]<=>]", o))

        for key in sorted(tweetdict):
            tweetids.append(key)
            tweets += tweetdict[key]+'\n'
        tweets = tweets.encode('utf-8')
        #print tweets

    elif format == "text":
        tweets = datasetfile

    tweetlist = []
    for t in tweets.splitlines():
        newtweet = []
        for word in t.split():
            # detach common trailing punctuation into its own token
            if word.endswith(",") or word.endswith(".") or word.endswith(")") or word.endswith("\'"):
                newtweet.append(word[:-1])
                newtweet.append(word[-1])
            else:
                newtweet.append(word)
        #print newtweet
        tweetlist.append(newtweet)


    results = ''
    tagged = []

    for tweet in tweetlist:
        tagged.append(st.tag(tweet))
        #print tagged[-1]
    #print len(tagged)

    inEntity = False
    for x, line in enumerate(tagged):
        #print line
        for (word, entity) in line:
            if entity != 'O' and inEntity:
                entity = 'I-'+entity
            elif entity != 'O' and inEntity == False:
                entity = 'B-'+entity
                inEntity = True
            else:
                inEntity = False
            results += word + '/' + entity + ' '
        # per tweet: append its id (x indexes the current tweet), then end the line
        if tweetids:
            results += "||" + tweetids[x]
        results += "\n"

    #print results
    return results
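
A hypothetical invocation (BASEPATH and the dataset file are placeholders; format is one of 'xml', 'nif' or 'text' as handled above):

print(ner('tweets.xml', 'xml', 'english'))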
Example No. 49
from __future__ import print_function

from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import nltk
import sys
import os

tagger = StanfordNERTagger('stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz',
                           path_to_jar='stanford-ner-2017-06-09/stanford-ner.jar')

print(sys.argv[1])

with open(sys.argv[1]) as fin:
    current_entity = []
    entities = []
    for line in fin:
        for token,tag in tagger.tag(word_tokenize(line)):
            if tag != 'O':
                current_entity.append((token,tag))
            else:
                if current_entity != []:
                    entities.append(current_entity)
                    current_entity = []
    if current_entity != []:
        entities.append(current_entity)

with open(os.path.splitext(sys.argv[1])[0]+'.ne','w') as fout:
    for entity in entities:
        print('%s_%s'%(' '.join([tok for tok,tag in entity]),entity[0][1]),file=fout)
Example No. 50
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger('data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', 'data/stanford-ner-2015-12-09/stanford-ner.jar')
tagger._stanford_jar = ':'.join(find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))
sentences = sent_tokenize(input('Enter a sentence: '))
# strip punctuation from each sentence and split it into tokens before tagging
tokenized = [''.join(c for c in s if c not in '",:.?/!@#$%^&*()][{}~').split() for s in sentences]
print(tagger.tag_sents(tokenized))
Example No. 51
def main():
    pathname = "test/*/*/*.pos"
    directory = glob.glob(pathname)
    for f in directory:
        with open(f) as readfile:
            print(f)
            print("Processing...")
            result = []
            history = []
            chunk = False

            # Collect words
            words = [line.split()[3] for line in readfile if len(line.split()) > 1]

            # NER tag using Stanford
            stanford = StanfordNERTagger('stanford-ner-2014-06-16/classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
            tagged = stanford.tag(words)

            for word_tuple in tagged:
                word = word_tuple[0]
                tag = word_tuple[1]
                # Determine new tag
                new_tag = transform_tag(tag, word, words)

                # Determine chunk
                if len(history) > 0 and history[-1][1] != new_tag:
                    if len(history) > 1 and history[-1][1]:
                        chunk = " ".join([tpl[0] for tpl in history])
                    history = []
                new_word_tuple = (word, new_tag)
                history.append(new_word_tuple)

                # Search wikipedia page
                # Process chunk
                if chunk:
                    wikiurl = wiki_search(chunk)

                    chunk_length = len(chunk.split())
                    old_combis = result[-chunk_length:]
                    result = result[:-chunk_length]
                    for old_combi in old_combis:
                        new_combi = old_combi[:-1]
                        new_combi.append(wikiurl)
                        result.append(new_combi)

                    chunk = False

                # Process word
                if new_tag:
                    wikiurl = wiki_search(word)
                else:
                    wikiurl = ""

                result.append([word, new_tag, wikiurl])

        # Write results to .ent.aut file
        with open(f) as readfile2, open(f + ".ent.aut", "a") as writefile:
            print("Writing...")
            n = 0
            for line in readfile2:
                # result holds one entry per word line, so only advance n for those
                if len(line.split()) > 1:
                    new_line = line.rstrip() + " " + result[n][1] + " " + result[n][2]
                    print(new_line, file=writefile)
                    n += 1
Example No. 52
import os
from nltk.tag.stanford import StanfordNERTagger
# java_path = "C:/Program Files/Java/jdk1.8.0_05/bin/java.exe"
# os.environ['JAVAHOME'] = java_path

# path2 = 'C:/Users/Pantelis/Desktop/stanford-ner'
st = StanfordNERTagger('classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')

f1 = open('Dostoyevski_TheGambler.txt','r')
f2 = open('Dostoyevsky_TheGambler_Results.txt','w')
f3 = open('Dostoyevsky_TheGambler_PERSONS.txt','w')
f4 = open('Dostoyevsky_TheGambler_Unique_PERSONS.txt','w')

book=f1.read()
persons =[]
#print book
results2 = st.tag(book.split())

for name,entity in results2:
    print name +" " + entity
    f2.write(name +" " + entity+"\n" )
    if entity == "PERSON":
        f3.write(name +"\n")
        if name not in persons:
            persons.append(name)
for k in persons:
    f4.write(k+"\n")

f1.close()
f2.close()
f3.close()
f4.close()
Example No. 53
def nertag(text):
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    print(st.tag(text.split()))
Example No. 54
def o_tag():
    """Returns a noun with a tag if the tag is unfindable or a location"""
    sttag = StanfordNERTagger('stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    sttags = sttag.tag(get_nouns())
    return [sttag for sttag in sttags if sttag[1] == 'O' or sttag[1] == "LOCATION"]
Example No. 55

'''
Created on Apr 12, 2016

@author: zhongzhu
'''
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize

import script_wrapper as stanford_parser 


sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."
st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

prev_tag_name = tags[0][1]
cur_entity = tags[0][0]
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        cur_entity = cur_entity + " " + cur_token
    else:
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
    # carry the current tag forward for the next token
    prev_tag_name = cur_tag_name
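
One caveat in this grouping loop: when it finishes, the last entity is still held in cur_entity and never appended. A minimal flush, using the variables above, would be:

entities.setdefault(prev_tag_name, []).append(cur_entity)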
Example No. 56
 def __init__(self, filepath):
     self.filepath = filepath
     self.parser = CommonRegex()
     self.stanford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')
Example No. 57

def stanfordNERExtractor(sentence):
    st = StanfordNERTagger(baseLocation + 'english.muc.7class.distsim.crf.ser.gz', baseLocation + 'stanford-ner.jar')
    return st.tag(sentence.split())
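
A hypothetical call (baseLocation is assumed to be a module-level path to the Stanford NER distribution):

baseLocation = '/opt/stanford-ner/'  # placeholder
print(stanfordNERExtractor('Angela Merkel met Barack Obama in Berlin'))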
Example No. 58
class StanfordNERTaggerExtractor(object):
    """docstring for ClassName"""
    def __init__(self):
        self.st = StanfordNERTagger('intent_class_models/stanford-jars/english.all.3class.distsim.crf.ser.gz' ,
            "intent_class_models/stanford-jars/stanford-ner.jar" )
        # self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz' ,
        #     'stanford-ner.jar' )

    def tag_text_single(self,text):
        '''
        :param text: raw text; it is tokenized before tagging
        :return: list of (token, NER class) tuples
        '''
        # assert type(text) == str
        sents = self.st.tag(nltk.word_tokenize(text))
        return sents

    def identify_NER_tags_single(self,text_tag,tag_to_find):
        '''
        :param text_tag: tagged text, as returned by tag_text_single
        :param tag_to_find: NER class to collect, e.g. 'PERSON'
        :return: de-duplicated list of multi-word entity strings
        '''
        tag_strs = []
        prev_wrd_tag = False
        for wrd,tag in text_tag:
            if tag == tag_to_find:
                if not prev_wrd_tag:
                    tag_strs.append(wrd)
                else:
                    prev_wrd = tag_strs.pop()
                    new_wrd = prev_wrd+' '+wrd
                    tag_strs.append(new_wrd)
                prev_wrd_tag = True
            else:
                prev_wrd_tag = False
        tags_final = []
        for wrd in tag_strs:
            if wrd not in tags_final:
                tags_final.append(wrd)
        return tags_final

    def tag_text_multi(self,text):
        ''' Tokenize the text into sentences and tag each sentence. '''
        tokenized_sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
        return self.st.tag_sents(tokenized_sents)

    def identify_NER_tags_multi(self,text_tag,tag_to_find):
        ''' Collect tag_to_find entities across all tagged sentences, without duplicates. '''
        tag_strs = []
        for sent_tag in text_tag:
            for wrd in self.identify_NER_tags_single(sent_tag,tag_to_find):
                if wrd not in tag_strs:
                    tag_strs.append(wrd)
        return tag_strs

    def tag_text_multi_from_single(self,ner_tags):
        ''' Convert the tags of one large text into sentence-based tags.
        Tagging sentence by sentence is slow, so the whole text is tagged
        at once and the result is split into sentences afterwards. '''
        sents = ''
        for wrd,_ in ner_tags:
            sents += wrd+' '
        sent_tags = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(sents)]
        cnt = 0
        final_tags = []
        for sent_ind in range(len(sent_tags)):
            sent_tag_list = []
            for wrd_ind in range(len(sent_tags[sent_ind])):
                try:
                    sent_tag_list.append(ner_tags[cnt])
                    cnt += 1
                except:
                    break
            final_tags.append(sent_tag_list)
        return final_tags
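
A usage sketch for this extractor (it assumes the intent_class_models/ paths wired into __init__ exist locally; output varies with the model):

ex = StanfordNERTaggerExtractor()
tagged = ex.tag_text_single("Tim Cook runs Apple in Cupertino .")
print(ex.identify_NER_tags_single(tagged, 'PERSON'))        # e.g. ['Tim Cook']
print(ex.identify_NER_tags_single(tagged, 'ORGANIZATION'))  # e.g. ['Apple']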
Example No. 59
#!/usr/bin/python
from nltk.tag.stanford import StanfordNERTagger
import operator
import re

fileList = open("fileList.txt", "r")
fileName = fileList.readlines()
fileList.close()

outfile = open("country.txt", "w")

english_nertagger = StanfordNERTagger('/Users/stellamberv/Documents/stanford-ner-2014-08-27/classifiers/english.muc.7class.distsim.crf.ser.gz','/Users/stellamberv/Documents/stanford-ner-2014-08-27/stanford-ner.jar')

for i in range(len(fileName)): 
  oneFile = open(fileName[i].rstrip(), "r")
  oneFileContent = oneFile.read()
  oneFile.close()

  str_split = english_nertagger.tag(oneFileContent.split())
  
  j = 0
  country = ""
  while j < len(str_split):
    if str_split[j][1] == u'LOCATION':
      country = country + " " + (str_split[j][0]).encode("utf-8")
      j = j + 1
    else:
      j = j + 1
      if len(country) == 0:
        continue
      else:
        # assumed completion (the snippet is cut off here): write out the
        # collected location and reset the accumulator
        outfile.write(country + "\n")
        country = ""