def __init__(self, cslm, transitions, tags):
    self.cslm = cslm
    self.transitions = transitions
    self.tags = tags
    self.engClassifier = StanfordNERTagger(
        "../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz",
        "../stanford-ner-2015-04-20/stanford-ner.jar")
    self.spanClassifier = StanfordNERTagger(
        "../stanford-ner-2015-04-20/classifiers/spanish.ancora.distsim.s512.crf.ser.gz",
        "../stanford-ner-2015-04-20/stanford-ner.jar")
    def __init__(self, classifier_path=None, ner_path=None, sutime_jar_path=None):
        # Change the paths according to your system; raw strings stop the
        # Windows backslashes from being read as escape sequences
        if classifier_path is None:
            classifier_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.muc.7class.distsim.crf.ser.gz"

        if ner_path is None:
            ner_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\stanford-ner.jar"

        if sutime_jar_path is None:
            sutime_jar_path = r"C:\stanford_corenlp\stanford-corenlp-full-2018-02-27\stanford-corenlp-full-2018-02-27"

        self.stanford_classifier = classifier_path
        self.stanford_ner_path = ner_path
        self.sutime_path = sutime_jar_path

        # Creating Tagger Object
        self.st = StanfordNERTagger(self.stanford_classifier, self.stanford_ner_path)
        self.su = SUTime(jars=self.sutime_path, mark_time_ranges=True, include_range=True)

        self.weather_terms = ["weather", "climate", "precipitation", "sun", "rain", "cloud", "snow", "hot", "humid", "cold", "sunny", "windy", "cloudy",
                              "rainy", "snowy", "misty", "foggy", "colder", "hotter", "warmer", "pleasant"]
        self.greet_terms = ["hello", "hey", "howdy", "hi", "yo", "yaw"]
        self.closure_terms = ["no", "nope", "thank you", "bye", "tata", "thanks", "that will be all", "that's it", "that'll be all"]
        self.day_terms = ["dawn", "dusk", "morning", "evening", "noon", "afternoon", "night", "tonight", "midnight", "midday"]  #, "hours"]
        self.date_terms = ["today", "tomorrow", "yesterday"]
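# A minimal usage sketch (not from the original source): assuming the
# __init__ above belongs to a weather-chatbot class, here called WeatherBot
# purely for illustration, and that the Stanford NER and SUTime paths above
# exist on this machine:
#
#   bot = WeatherBot()
#   print(bot.st.tag("Will it rain in Boston tomorrow".split()))
#   # list of (token, tag) pairs, e.g. ('Boston', 'LOCATION')
#   print(bot.su.parse("tomorrow morning"))
#   # SUTime temporal annotations as a list of dicts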
Example #3
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file,
                        output_file):
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile

    # Creating Tagger Object from the paths built above
    st = StanfordNERTagger(model_filename=modelfile,
                           path_to_jar=jarfile,
                           encoding='utf-8')

    tagged_ne = []
    with open(tag_this_file, "r") as f:
        for line in f:
            line = line.split()
            if len(line) > 0:
                tagged_ne.append(line[0])
            else:
                # Remove the SENENDs from the output file afterwards; they are
                # needed here to keep the sentence format consistent.
                # Keep in mind that some "/" are still removed; this is
                # replaced in a postprocessing step.
                tagged_ne.append("SENEND")
    print(tagged_ne)

    # Tag the file using Stanford NER
    out = st.tag(tagged_ne)

    # Write the results to a tsv file
    with open(output_file, "w") as f:
        for i in out:
            f.write(str(i[0]) + "\t" + i[1] + "\n")
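# Illustrative call (file names and paths are assumptions, not from the
# original source; the input file holds one token per line, with a blank
# line marking each sentence boundary):
stanford_ner_tagger("./stanford-ner-2018-02-27/",
                    "stanford-ner.jar",
                    "classifiers/english.all.3class.distsim.crf.ser.gz",
                    "tokens_to_tag.txt",
                    "tagged_output.tsv")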
Example #4
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        tags = list(NERTags)
        # indexes of the tokens in each sentence that carry a non-"O" NER tag
        ids = [[i for i, a in enumerate(t) if a[1] != "O"]
               for t in tags]

        phrases = []
        for i, t in zip(ids, tags):
            phrase = ""
            tt = "N/A"
            for index, p in enumerate(i):
                if index == len(i) - 1:
                    # last tagged token: finish the phrase and pair it with its tag
                    phrase += "{}".format(t[p][0])
                    tt = phrase, t[p][1]
                else:
                    phrase += "{} ".format(t[p][0])

            phrases.append(tt)
        return phrases
Example #5
    def ner_tagger(self):
        if self._ner_tagger is None:
            jar = osp.join(osp.dirname(__file__), 'model', 'stanford-ner.jar')
            model = osp.join(osp.dirname(__file__), 'model', 'english.all.3class.distsim.crf.ser.gz')
            self._ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

        return self._ner_tagger
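# Lazy initialization: the JVM-backed tagger is built on the first call and
# cached in self._ner_tagger afterwards. A minimal usage sketch, assuming an
# instance named extractor of the enclosing class (name is illustrative):
#
#   print(extractor.ner_tagger().tag("Tim Cook runs Apple".split()))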
Example #6
    def __init__(self, articleText):
        self.blob = textblob.TextBlob(articleText)
        #blob.tags
        keywords = [
            x[0] for x in self.blob.tags
            if "NNP" in x[1] or "NN" in x[1] or "CD" in x[1]
        ]
        self.keywords = set(keywords)
        self.nounPhrases = Counter(self.blob.noun_phrases).most_common()
        st = StanfordNERTagger(
            'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz',
            'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/stanford-corenlp-caseless-2015-04-20-models.jar'
        )
        self.namedEntities = dict((a.lower(), b) for a, b in set(
            [x for x in st.tag(articleText.split()) if x[1] != 'O']))
        self.namedPhrases = {}

        for np in self.nounPhrases:
            tags = []
            for word in np[0].split():
                tag = 'O'
                if word.lower() in self.namedEntities.keys():
                    tag = self.namedEntities[word.lower()]
                tags.append(tag)

            np_tag = Counter(tags).most_common(1)[0][0]
            if np_tag != 'O':
                self.namedPhrases[np[0].lower()] = np_tag

    def ner_tagger(self):
        if self._ner_tagger is None:
            jar = 'stanford-ner.jar'
            model = 'english.all.3class.distsim.crf.ser.gz'
            self._ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

        return self._ner_tagger
def stanfordNer():
    with open("federer.txt", "r") as f:
        text = f.read()

    jar = './stanford-ner-tagger/stanford-ner.jar'
    model = './stanford-ner-tagger/english.muc.7class.distsim.crf.ser.gz'

    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
    words = nltk.word_tokenize(text)
    color_print('\nStanfordNER', color='blue', bold=True, underline=True)
    color_print('\nRecognized entities:\n',
                color='yellow',
                bold=True,
                underline=True)

    table = PrettyTable(["Recognized entity", "Entity type"])

    for token, tag in ner_tagger.tag(words):
        if tag != "O":
            table.add_row([token, tag])
    print(table)

    with open("Rezultati_StanfordNER.txt", "w", encoding="utf8") as text_file:
        text_file.write("Recognized entities: \n\n%s" % table)

    print("Results saved to file Rezultati_StanfordNER.txt")
def tagSent(sent):
    words = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(words)
    Entities = {}
    EntityName = ""
    length = len(pos_tags)
    NounTag = ["NN", "NNS", "CD"]
    for i in range(length):
        if pos_tags[i][1] == 'NNP':
            # join consecutive NNP tokens into one underscore-separated entity
            if i > 0 and pos_tags[i - 1][1] == 'NNP':
                EntityName += '_'
            EntityName += pos_tags[i][0]
            if i == length - 1 or (i < length - 1 and pos_tags[i + 1][1] != 'NNP'):
                Entities[EntityName] = pos_tags[i][1]
                EntityName = ""
        if pos_tags[i][1] in NounTag:
            EntityName += pos_tags[i][0]
            # prepend any adjectives that directly precede the noun
            j = i - 1
            while j >= 0 and pos_tags[j][1] == "JJ":
                EntityName = pos_tags[j][0] + " " + EntityName
                j -= 1
            Entities[EntityName] = "OTHER"
            EntityName = ""

    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    # upgrade any collected entity that contains a word NER tagged as PERSON
    for token in list(Entities):
        for word, ner_tag in NerSen:
            if (word in token) and (ner_tag == "PERSON"):
                Entities[token] = "PERSON"
    return Entities
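# A minimal sketch of calling tagSent, assuming the CoNLL 4-class model and
# stanford-ner.jar sit in the working directory (the sentence is illustrative):
print(tagSent("Roger Federer won the Australian Open"))
# expected shape: a dict mapping entity strings to 'PERSON', 'NNP' or 'OTHER'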
    def __init__(self):

        # stanford ner tagger
        from nltk.tag.stanford import StanfordNERTagger
        self.ner_stanford = StanfordNERTagger(
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar'
        )

        # stanford pos tagger
        from nltk.tag.stanford import StanfordPOSTagger
        self.pos_stanford = StanfordPOSTagger(
            '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
            '/home/harish/Documents/softwares/running/corenlp/stanford-postagger-full-2015-04-20/stanford-postagger.jar'
        )

        # spacy ner tagger
        import spacy
        self.ner_spacy = spacy.load('en')

        # wordnet lemmatizer
        from nltk.stem.wordnet import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()

        self.tagged_output = {}
def getNER(sent):
    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    Entities = {}
    EntityName = ""
    length = len(NerSen)
    for i in range(length):
        if NerSen[i][1] != 'O':
            if i >0 and NerSen[i - 1][1] != 'O':
                EntityName += '_'
            EntityName += NerSen[i][0]
            if i == length-1 or (i < length-1 and NerSen[i + 1][1] == 'O'):
                Entities[EntityName] = NerSen[i][1]
                EntityName = ""
    words = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(words)
    EntityKey = list(Entities.keys())
    for token, tag in pos_tags:
        if tag == "NNP":
            # keep NNPs that are not already part of a recognized entity
            if not any(token in key for key in EntityKey):
                Entities[token] = "NNP"
    return Entities
Example #12
def extract_character_names(book_contents):
    data = CharacterData()

    lines = sent_tokenize(book_contents)
    tagger = StanfordNERTagger(
        "./stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz",
        "./stanford-ner/stanford-ner.jar")

    discovered_names_in_line = set()
    for line in lines:
        words = word_tokenize(line)
        tagged_tokens = tagger.tag(words)

        name = ""
        first_name = ""
        discovered_names_in_line.clear()
        for word, tag in tagged_tokens:
            if tag == "PERSON":
                if name == "":
                    first_name = word
                name += word + " "
            else:
                if name != "":
                    name = name.strip()
                    if first_name not in discovered_names_in_line:
                        data.add_line_for_name(first_name, name, line)
                        discovered_names_in_line.add(first_name)
                    name = ""
                    first_name = ""

    return data
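# A minimal sketch, assuming a plain-text novel at book.txt (the file name is
# an assumption) and that CharacterData, sent_tokenize and word_tokenize are
# importable as in the snippet above:
with open("book.txt") as f:
    character_data = extract_character_names(f.read())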
Example #13
def main():
    try:
        filename = sys.argv[1]
        f = open(filename, "r")
    except IndexError:
        print(
            "You probably didn't specify an input file. Correct format: python3 ass5.py <InputFileName>"
        )
        exit()
    except FileNotFoundError:
        print(
            "The file you specified does not exist. Please check and try again."
        )
        exit()
    inputs = f.readlines()
    f.close()

    jar = './stanford-ner.jar'
    model = './ner-model-english.ser.gz'
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

    with open("output.txt", "w") as file2:
        for sentence in inputs:
            words = nltk.word_tokenize(sentence)

            for x in ner_tagger.tag(words):
                file2.write("[" + x[0] + ", " + x[1] + "], ")
            file2.write('\n')
def named_entities(sentence):
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar='../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    tags = st.tag(word_tokenize(sentence))

    # clean up the result from the tagger
    prev_tag_name = str(tags[0][1])
    cur_entity = str(tags[0][0])
    entities = {}
    for i in range(1, len(tags)):
        cur_tag = tags[i]
        cur_token = str(cur_tag[0])
        cur_tag_name = str(cur_tag[1])
        if cur_tag_name == prev_tag_name:
            cur_entity = cur_entity + " " + cur_token
        else:
            if prev_tag_name not in entities:
                entities[prev_tag_name] = []
            # change encoding, another way is to .encode('ascii','ignore')
            entities[prev_tag_name].append(str(cur_entity))
            cur_entity = cur_token
        prev_tag_name = cur_tag_name
    # flush the final entity, which the loop above never appends
    entities.setdefault(prev_tag_name, []).append(str(cur_entity))
    if 'O' in entities:
        del entities['O']  # not needed, 'O' means not a named entity
    return entities
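# A minimal sketch of named_entities, assuming the ../lib Stanford NER paths
# above are valid (the sentence is illustrative):
print(named_entities("Barack Obama met Angela Merkel in Berlin"))
# expected shape: {'PERSON': [...], 'LOCATION': [...]}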
Example #15
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file,
                        output_file):
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile

    st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    tagged_ne = []
    with open(tag_this_file, "r") as f:
        for line in f:
            line = line.split()
            if len(line) > 0:
                tagged_ne.append(line[0])
            else:
                # Remove the SENENDs from the output file afterwards; they are
                # needed here to keep the sentence format consistent.
                # Keep in mind that some "/" are still removed; this is
                # replaced in a postprocessing step.
                tagged_ne.append("SENEND")
    print(tagged_ne)

    # Tag the file using Stanford NER
    out = st.tag(tagged_ne)

    # Write the results to a tsv file
    with open(output_file, "w") as f:
        for i in out:
            f.write(str(i[0]) + "\t" + i[1] + "\n")
def stanford_tag(line):

    # Adjust to your system; config_java expects the java executable itself
    java_path = r'C:\Program Files\Java\jdk1.8.0_131\bin\java.exe'
    nltk.internals.config_java(java_path)
    # NERTagger
    stanford_dir = r'C:\Python\Python36\Lib\stanford-ner-2017-06-09'
    jarfile = stanford_dir + r'\stanford-ner.jar'
    modelfile = stanford_dir + r'\classifiers\english.conll.4class.distsim.crf.ser.gz'

    st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    tagged_st = st.tag(line)
    #for tag in tagged_st:
    #	if tag[1] in ["PERSON", "LOCATION", "ORGANIZATION"]: print (tag)
    # group consecutive tokens that share the same tag into one chunk
    for tag, chunk in groupby(tagged_st, lambda x: x[1]):
        if tag != "O": print("%-12s" % tag, " ".join(w for w, t in chunk))
Example #17
    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        n = []
        for nt in NERTags:
            n.extend(nt)

        # get the indexes of all words that have NER tags
        ids = [i for i, a in enumerate(n) if a[1] != "O"]
        a = np.array(ids)

        consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)

        phrases = []
        for ci in consecutive_ids:
            phrase = ""
            for id_ in ci:
                phrase += "{} ".format(n[id_][0])
            phrases.append(phrase.strip())

        cleaned_phrases = self.del_repeat(phrases)
        return cleaned_phrases
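# A standalone demo of the consecutive-index grouping trick used above
# (the values are illustrative):
import numpy as np
a = np.array([3, 4, 5, 9, 10, 14])
print(np.split(a, np.where(np.diff(a) != 1)[0] + 1))
# [array([3, 4, 5]), array([ 9, 10]), array([14])]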
Example #18
    def out(self):
        stanford_dir = '/home/gary/stanford-ner-2015-04-20/'
        jarfile = stanford_dir + 'stanford-ner.jar'
        modelfile = stanford_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
        st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
        tokenized_text = word_tokenize(self.text)
        classified_text = st.tag(tokenized_text)

        #return organization names
        regexNN = re.compile('ORGANIZATION')
        org_list = list()
        i = 0
        while i < len(classified_text):
            item = classified_text[i]
            if regexNN.search(item[1]):
                n = i + 1
                company = item[0]

                while n < len(classified_text):
                    item = classified_text[n]
                    if regexNN.search(item[1]):
                        company = company + " " + item[0]
                        n = n + 1
                    else:
                        break
                i = n
                org_list.append(company)
            i = i + 1

        return org_list
Example #19
def check_ceo(company):
    global REPORTS_pkl, CIKs, Names
    report = get_company_report(company)
    stanford_tagger = StanfordNERTagger(
        'stanford-ner/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner/stanford-ner.jar')
    es = Elasticsearch()

    # remove any stale index; ignore the error if it does not exist yet
    es.indices.delete(index="test-index", ignore=[400, 404])

    if not es.indices.exists("test-index"):
        data = report.replace("\n", ".\n").replace("\n\n", "\n")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = tokenizer.tokenize(data)
        for j, sentence in enumerate(sentences):
            doc = {'text': sentence}
            res = es.index(index="test-index", doc_type='text', id=j, body=doc)
    es.indices.refresh(index="test-index")

    text = "ceo"
    ceos = search_for(text, es, stanford_tagger)
    text = "chief executive officer"
    chiefs = search_for(text, es, stanford_tagger)
    ceos = {**ceos, **chiefs}
    ceos = full_names(ceos)
    for (ceo, v) in ceos.items():
        print("{}: {}".format(ceo, v))
    print()
    return ceos.keys()
Example #20
    def Name_Entity_recognition2(f2):
        st = StanfordNERTagger(os.environ['STANFORD_CLASSIFIER'],
                               os.environ['STANFORD_NER_PATH'],
                               encoding='utf-8')
        # give the tagger's JVM up to 1 GB of heap
        st.java_options = '-mx1000m'
        word = word_tokenize(f2)
        classified_text = st.tag(word)
        return classified_text
Example #21
def get_stanford_ner():
    global st
    if st is None:
        st = StanfordNERTagger(
            model_filename=stanford_nlp_model,
            path_to_jar=stanford_nlp_jar,
        )
    return st
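# A minimal sketch, assuming stanford_nlp_model and stanford_nlp_jar are
# module-level paths to a classifier and stanford-ner.jar, and st is a
# module-level variable initialized to None:
tagger = get_stanford_ner()
print(tagger.tag("Angela Merkel spoke in Berlin".split()))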
Example #22
def getWhen(tokenizedSentences):
    os.environ['CLASSPATH'] = directory + "stanford-ner-2015-04-20"
    whenTags = StanfordNERTagger(
        directory +
        'stanford-ner-2015-04-20/classifiers/english.muc.7class.distsim.crf.ser.gz'
    ).tag_sents(tokenizedSentences)
    print(whenTags)
    return whenTags
Example #23
    def getNERs(allTokens, pltfrm="linux"):
        nerDict = {"person": "", "org": "", "location": ""}
        nerpath = "nerClassifier/"
        if pltfrm == "windows":
            st = StanfordNERTagger(
                nerpath + "english.all.3class.distsim.crf.ser.gz",
                "stanford-ner-2017-06-09/stanford-ner.jar")
        else:
            st = StanfordNERTagger(
                nerpath + "english.all.3class.distsim.crf.ser.gz",
                "stanford-ner-2017-06-09/stanford-ner.jar")
        tags = st.tag(allTokens)
        #tags_ners = [i for i,j in tags if j != 'O']
        nerDict['person'] = [i for i, j in tags if j == 'PERSON']
        nerDict['org'] = [i for i, j in tags if j == 'ORGANIZATION']
        nerDict['location'] = [i for i, j in tags if j == 'LOCATION']
        return nerDict
Example #24
def getNER(tokenizedSentences):
    os.environ['CLASSPATH'] = directory + "stanford-ner-2015-04-20"
    nerTags = StanfordNERTagger(
        directory +
        'stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz'
    ).tag_sents(tokenizedSentences)
    # print(nerTags)
    return nerTags
Example #25
    def get_ner_tags(self):
        #import nltk
        #nltk.internals.config_java("C:/Program Files/Java/jre1.8.0_111/bin/java.exe")
        os.environ['CLASSPATH'] = dir + 'stanford-ner'
        return StanfordNERTagger(
            dir +
            'stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'
        ).tag(self.tokenized_text)
Example #26
    def __init__(self):
        self.stanford_ner = StanfordNERTagger(
            self.config.model_stanford_filename_ner,
            self.config.model_stanford_path_jar_ner)
        self.stanford_pos = StanfordPOSTagger(
            self.config.model_stanford_filename_pos,
            self.config.model_stanford_path_jar_pos)
        self.stanford_pos.java_options = '-mx8g'
def NERTag(text):
    os.environ['CLASSPATH'] = "C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = "C:/Users/1/James/stanford-parser-full-2015-12-09"
    os.environ['JAVAHOME'] = "C:/Program Files/Java/jdk1.8.0_102"
    ner = StanfordNERTagger('C:/Users/1/James/stanford-ner-2015-12-09/classifiers/'
                            'english.all.3class.distsim.crf.ser.gz',
                            'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')
    # tag() expects a token sequence; a raw string would be tagged character by character
    r = ner.tag(text.split())
    return r
def namedEntityMatch(row):
    #os.environ["JAVA_HOME"]="C:\Program Files\Java\jdk1.8.0_151"
    if "JAVA_HOME" not in os.environ:
        print(
            "Please set the value of JAVA_HOME environment variable, or install java in your machine"
        )
        sys.exit(-1)

    ner = StanfordNERTagger(
        r"stanford-ner-2014-06-16\classifiers\english.all.3class.distsim.crf.ser.gz",
        r"stanford-ner-2014-06-16\stanford-ner.jar")

    ques1Entities = ner.tag(str(row['question1']).lower().split())
    ques2Entities = ner.tag(str(row['question2']).lower().split())
    entityDict1 = {}
    entityDict2 = {}
    # group each question's tagged tokens by entity type ("O" = no entity)
    for entity in ques1Entities:
        if entity[1] != "O":
            if entity[1] in entityDict1:
                entityDict1[entity[1]].append(entity[0])
            else:
                entityDict1[entity[1]] = [entity[0]]

    for entity in ques2Entities:
        if entity[1] != "O":
            if entity[1] in entityDict2:
                entityDict2[entity[1]].append(entity[0])
            else:
                entityDict2[entity[1]] = [entity[0]]

    if len(entityDict1) == 0 or len(entityDict2) == 0:
        return 0

    totalCount = 0
    matchCount = 0
    for key in entityDict1:
        entityList1 = entityDict1[key]
        if key in entityDict2:
            entityList2 = entityDict2[key]
            for item in entityList1:
                if item in entityList2:
                    matchCount += 1
                totalCount += 1
    for key in entityDict2:
        entityList2 = entityDict2[key]
        if key in entityDict1:
            entityList1 = entityDict1[key]
            for item in entityList2:
                if item in entityList1:
                    matchCount += 1
                totalCount += 1

    return float(matchCount) / float(totalCount)
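# A minimal sketch over a pandas DataFrame of question pairs, matching the
# row keys used above (the data is illustrative):
import pandas as pd
df = pd.DataFrame([{"question1": "Who founded Microsoft?",
                    "question2": "Who is the founder of Microsoft?"}])
df["ner_match"] = df.apply(namedEntityMatch, axis=1)
print(df["ner_match"])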
Example #29
def o_tag():
    """Return nouns whose NER tag is 'O' (no entity type found) or LOCATION."""
    sttag = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    sttags = sttag.tag(get_nouns())
    return [
        sttag for sttag in sttags if sttag[1] == 'O' or sttag[1] == "LOCATION"
    ]
Example #30
def NERFunc(file_paths, fileName, G):
    os.environ["STANFORD_MODELS"] = os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20")
    st = StanfordNERTagger(os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20", "classifiers", "english.all.3class.distsim.crf.ser.gz"),
                           os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20", "stanford-ner.jar"))
    #print "Java error path JAVAHOME=" + str(os.environ["JAVAHOME"])
    #print "Java error path JAVA_HOME=" + str(os.environ["JAVA_HOME"])
    for index, file_path in enumerate(file_paths):
        node_name = fileName[index]
        data = readFromFile(file_path)
        if data is not None:
                netagged_words = st.tag(data.split())
                for tag, chunk in groupby(netagged_words, lambda x:x[1]):
                    if tag != "O":
                        entity = " ".join(w for w, t in chunk)
                        if  entity != "":
                            entity = entity.encode('utf-8')
                            entity = re.sub(r'[^\x00-\x7F]+', ' ', entity)
                            entity = entity.lower()
                            no_punctuation = entity.translate(None, string.punctuation)
                            entity=re.sub("[^a-zA-Z]+", " ", no_punctuation)
                            #print("Tag = "+ tag+" entity = "+ entity)
                            #If this topic doesn't exist as a node then add it
                            if entity not in G.nodes():
                                G.add_node(entity)
                                G.node[entity]['type'] = graphConstants.TYPE_NER
                            #If the edge between this doc and entity is already present or not
                            if G.has_edge(node_name,entity) is False:
                                G.add_edge(node_name,entity, weight = 1)
                            else:
                                G[node_name][entity]["weight"] = G[node_name][entity]["weight"] + 1
                                graphUtils.logger.info("entity topic entity = "+entity + " document ="+node_name)
                            if G.has_edge(entity,node_name) is False:
                                G.add_edge(entity,node_name, weight = 1)
                            else:
                                G[entity][node_name]["weight"] = G[entity][node_name]["weight"] + 1
                            
                            topics = entity.split()
                            if(len(topics) > 1):
                                for word in entity.split():
                                    #Only change weight if this topic already exists
                                    if word in G.nodes():
                                        #If the edge between this doc and topic is already present or not
                                        if G.has_edge(node_name,word) is False:
                                            G.add_edge(node_name,word, weight = 1)
                                        else:
                                            G[node_name][word]["weight"] = G[node_name][word]["weight"] + 1
                                        if G.has_edge(word,node_name) is False:
                                            G.add_edge(word,node_name, weight = 1)
                                        else:
                                            G[word][node_name]["weight"] = G[word][node_name]["weight"] + 1
                                        graphUtils.logger.info("entity topic word = "+word + " document ="+node_name)
    return G