def main(filename):
    pdf_content = getPDFContent(filename + '.pdf')
    summy = summarize(pdf_content[1], 0.05)
    # Strip control characters and non-ASCII bytes before tagging.
    text = re.sub(r'[%s]' % ''.join(map(unichr, range(32) + range(127, 256))),
                  '', pdf_content[1])
    tagger = st.StanfordNERTagger(
        '/home/cgh/PDFile/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/usr/share/stanford-ner/stanford-ner.jar')
    tag_results = tagger.tag(text.split())
    names = []
    for i in tag_results:
        try:
            if i[1] == 'PERSON':
                names.append(i[0])
        except:
            pass
    with open(filename + '_structured.txt', 'w') as f:
        f.write(pdf_content[1])
    total_rank = []
    for section in get_sections(filename + '_structured.txt'):
        sec = re.sub(
            r'[%s]' % ''.join(map(unichr, range(32) + range(127, 256))),
            '', section)
        sec_rank_list = score_keyphrases_by_textrank(sec)
        total_rank.append(sec_rank_list[:8])
    words = []
    if total_rank == []:
        # No sections were found; fall back to ranking the whole document.
        sec = re.sub(
            r'[%s]' % ''.join(map(unichr, range(32) + range(127, 256))),
            '', pdf_content[1])
        sec_rank_list = score_keyphrases_by_textrank(sec)
        total_rank = sec_rank_list[:15]
        for ranks in total_rank:
            words.append(ranks[0])
    else:
        for ranks in total_rank:
            for i in ranks[:3]:
                words.append(i[0])
    #words.pop(0)
    words = list(set(words))
    ytlinks = []
    print words
    w_1 = words[:len(words) / 2]
    w_2 = words[len(words) / 2 + 1:]  # second half of the keyword list
    ytlinks.append(get_youtube_links(str(w_1).strip('[]')))
    ytlinks.append(get_youtube_links(str(w_2).strip('[]')))
    return total_rank, summy, names, ytlinks

def __init__(self, f, keywords):
    s = f.read()
    self.keywords = keywords
    self.file = s
    self.sentences = sent_tokenize(s)
    self.parser = StanfordParser(
        "stanford-parser-full-2014-08-27/stanford-parser",
        "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
    self.tagger = st.StanfordPOSTagger(
        "stanford-postagger-full-2014-08-27/models/french.tagger",
        "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
    self.ner = st.StanfordNERTagger(
        "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
        "stanford-ner-2014-08-27/stanford-ner.jar")
    self.trees = []
    for sent in self.sentences:
        try:
            self.trees.append(self.parser.raw_parse(sent))
        except OSError:
            self.trees.append([])
    self.words = self.word_tokenize_without_punc(s)
    self.stemmer = FrenchStemmer()
    self.stems = [self.stemmer.stem(w) for w in self.words]
    self.words_sentences = [
        self.word_tokenize_without_punc(s) for s in self.sentences
    ]
    self.tags = self.tagger.tag(self.words)
    self.tags_sentences = [
        self.tagger.tag([w for w in self.words_sentences[i]])
        for i in range(len(self.sentences))
    ]
    self.entities = self.ner.tag(self.words)
    self.entities_sentences = [
        self.ner.tag([w for w in self.words_sentences[i]])
        for i in range(len(self.sentences))
    ]
    self.left_subject = defaultdict(lambda: 0)
    self.left_compl = defaultdict(lambda: 0)
    self.left_neg_subject = defaultdict(lambda: 0)
    self.left_neg_compl = defaultdict(lambda: 0)
    self.right_subject = defaultdict(lambda: 0)
    self.right_compl = defaultdict(lambda: 0)
    self.right_neg_subject = defaultdict(lambda: 0)
    self.right_neg_compl = defaultdict(lambda: 0)
    self.left_ref = 0
    self.right_ref = 0
    self.trees_leaves = []
    for e in self.trees:
        res = []
        if e:  # skip sentences whose parse failed (empty placeholder appended above)
            extract_leaves(list(e)[0], res)
        self.trees_leaves.append(tuple_to_dict(res))
    self.extract_keywords()

def clean_data(self, text):
    if 'strip_urls' in self.attributes:
        text = rx.strip_urls(text)

    # stopword language list: 'english', 'french', 'spanish', 'german', 'portuguese'
    stop_words = []
    for lang in self.lang:
        stop_words.extend(stopwords.words(lang))
    # domain-specific noise terms
    stop_words.extend([
        'brexit', 'twitter', 'tweet', 'euref', 'eureferendum', 'correspondent',
        'referendum', 'pic', 'eurefpic', 'eupic', 'com', 'bbc', 'co', 'html',
        'tweet', 'página', 'anterior', 'iplayer', 'la', 'pretender', 'pode',
        'episode', 'http', 'www', 'javascript', 'que', 'pic', 'de', 'android',
        'source', 'medium', 'video', 'mr', 'bloomerg', 'economist', self.media
    ])
    # remove stop words
    text = [
        word for word in text.split() if word.lower() not in stop_words
    ]

    if 'stemming' in self.attributes:
        tagger = PorterStemmer()
        text = [tagger.stem(w) for w in text]

    if 'lemmatization' in self.attributes:
        wordnet_lemmatizer = WordNetLemmatizer()
        text = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in text]

    # retrieve only nouns
    if 'pos_tag' in self.attributes:
        tagged = pos_tag(text)
        text = [word for word, pos in tagged if re.findall(r'NN', pos)]

    if 'ner' in self.attributes:
        path = os.path.abspath(
            os.curdir) + '\\utils\\stanford-ner-2018-02-27\\'
        tagger = stanford.StanfordNERTagger(
            path + 'classifiers\\english.all.3class.distsim.crf.ser.gz',
            path + 'stanford-ner.jar')
        text = tagger.tag(text)
        text = [word + '_' + entity for word, entity in text]
        # print(" ".join(text))

    if 'w2v' in self.attributes:
        return text
    else:
        return " ".join(text)

def get_ner_sentences(sent_list):
    tagger_class = "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/classifiers/%s" % (
        ner_tag_type['class7'])
    stf = st.StanfordNERTagger(
        tagger_class,
        "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/stanford-ner.jar"
    )
    # tokenized_sents = [word_tokenize(sent) for sent in sent_list]
    # ner_sents = stf.tag_sents(tokenized_sents)
    return []

def __init__(self, classifier_choice=2):
    nerfolderpath = "/home/dicle/Documents/tools/en_stanford_NER/stanford-ner-2015-12-09"
    ext = ".ser.gz"
    classifiers = ["english.all.3class.distsim.crf",
                   "english.conll.4class.distsim.crf",
                   "english.muc.7class.distsim.crf"
                   ]
    nerclassifierpath = os.path.join(nerfolderpath, "classifiers",
                                     classifiers[classifier_choice] + ext)
    nerjarname = "stanford-ner-3.6.0.jar"
    nerjarpath = os.path.join(nerfolderpath, nerjarname)
    self.ner_tagger = st.StanfordNERTagger(nerclassifierpath, nerjarpath)

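# A minimal usage sketch (not from the original source): given a tagger built as in
# the constructor above, StanfordNERTagger.tag() takes a token list and returns
# (token, label) pairs. The helper name `extract_persons`, the sample sentence, and
# the use of nltk.word_tokenize are illustrative assumptions.
def extract_persons(ner_tagger, sentence):
    """Return the tokens labelled PERSON in `sentence`."""
    import nltk
    tokens = nltk.word_tokenize(sentence)
    return [token for token, label in ner_tagger.tag(tokens) if label == 'PERSON']

# e.g. extract_persons(self.ner_tagger, "Barack Obama visited Paris.") might
# return ['Barack', 'Obama'], depending on the chosen classifier.
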
def stanford_tagging(data_loc, idx, tmp_loc):
    st = stag.StanfordNERTagger('./lib/english.all.3class.distsim.crf.ser.gz',
                                './lib/stanford-ner.jar')
    for subdir, dirs, files in os.walk(tmp_loc):
        for file in files:
            r_file_path = os.path.join(subdir, file)
            f = open(r_file_path, 'r')
            w_file_path = data_loc + r_file_path[len(tmp_loc):]
            if not os.path.exists(os.path.dirname(w_file_path)):
                os.makedirs(os.path.dirname(w_file_path))
            td = open(w_file_path, 'w')
            for line in f:
                cPickle.dump(st.tag(line.split()), td)
            td.close()
            idx += 1
            if idx % 5 == 0:
                print 'Tagging done for %s files' % str(idx)

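# Hedged companion sketch (not in the original): stanford_tagging() above pickles one
# tagged sentence per input line into each output file, so reading a result file back
# means calling cPickle.load() repeatedly until EOF. The function name
# `load_tagged_file` is an assumption; cPickle is assumed imported as in the snippet.
def load_tagged_file(path):
    tagged_sentences = []
    with open(path, 'rb') as fh:
        while True:
            try:
                tagged_sentences.append(cPickle.load(fh))
            except EOFError:
                break
    return tagged_sentences
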
def getNamedEntities(self, **kwargs):
    countTerms = Counter()
    pronounTermList, filteredListCollection = self.filterData(
        kwargs['FrequentTerms'])
    if kwargs['FrequentTerms'] is True:
        taggerHandle = st.StanfordNERTagger(
            '/home/harsh/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/home/harsh/stanford-ner-2014-06-16/stanford-ner.jar')
        entityList = taggerHandle.tag(pronounTermList)
        finalEntityList = [
            term[0] for term in entityList if not term[1] == 'O'
        ]
        countTerms.update(finalEntityList)
        frequentWords = countTerms.most_common(5)
        # print(frequentWords)
        return frequentWords, filteredListCollection
    else:
        return filteredListCollection

}

print('Processing file.')

# setting parameters
inputFilePath = 'NLP_Challenge_07_Mar.xlsx'
inputSheetName = 'Sheet1'
inputDataColumn = 'A'
tableHasHeader = True
outputFilePath = 'NLP_Challenge_07_Mar_Output.xlsx'
outputSheetName = 'Sheet1'
tokenizer = RegexpTokenizer(r'\w+')

# ner settings
gzPath = 'stanford-ner\\english.all.3class.distsim.crf.ser.gz'
jarPath = 'stanford-ner\\stanford-ner.jar'
tagger = st.StanfordNERTagger(gzPath, jarPath)

# read input data
print('Reading input data.')
inputWorkbook = load_workbook(filename=inputFilePath, read_only=False)
inputWorksheet = inputWorkbook[inputSheetName]
inputWorksheetRowCount = inputWorksheet.max_row
inputData = [col.value for col in inputWorksheet['A']]
if tableHasHeader and len(inputData) > 0:
    inputData.pop(0)

print('Processing input data.')
processedData = [process_data(data) for data in inputData]

print('Writing processed data to output excel file.')
outputWorkbook = Workbook()
outputSheet = outputWorkbook.create_sheet('Output Data', 0)

with open('agriculture.csv') as csvfile:
    raw = []
    data = csv.DictReader(csvfile)
    for row in data:
        raw.append(row['text'])

### Removing garbage value from Tweets
tokenizer = RegexpTokenizer(r'\w+')

### Tokenizing, Tagging and removing Stop words from tweets
final_words = []  # initialization assumed; not present in the excerpt
for i in range(len(raw)):
    clean_token = tokenizer.tokenize(raw[i])
    java_path = "C:/Program Files/Java/jdk1.8.0_111/bin"
    os.environ['JAVAHOME'] = java_path
    tagger = ST.StanfordNERTagger(
        '.../stanford-ner-2014-06-16/stanford-ner-2014-06-16/classifiers/english.conll.4class.distsim.crf.ser.gz',
        '.../stanford-ner-2014-06-16/stanford-ner-2014-06-16/stanford-ner.jar',
        encoding='utf-8')
    mytweet_tag = tagger.tag(clean_token)
    stop = set(stopwords.words('english'))
    # keep (token, tag) pairs whose token is not an English stop word
    without_stop = [pair for pair in mytweet_tag if pair[0].lower() not in stop]
    final_words.append(without_stop)

###
# Tag tokens with standard NLP BIO tags
bio_tagged = []
prev_tag = "O"
for i in range(len(final_words)):
    for token, tag in final_words[i]:
        if tag == "O":
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue

import nltk
import nltk.tag.stanford as st

path_pre = "[your full path for NER package]/stanford-ner-2015-12-09/"
st = st.StanfordNERTagger(path_pre + 'classifiers/english.all.3class.distsim.crf.ser.gz',
                          path_pre + 'stanford-ner.jar')

text1 = """Reality checks await for ambitious Liberals"""
text2 = """Reality checks await for ambitious Liberals!"""
text3 = """Emmanuel means Jesus!"""
text4 = """Does Cherry like Ice Cream like Hanhan does?"""
text5 = """Reality checks await for ambitious Liberals?"""
text6 = """Does Tim Hortons like KFC?"""

all_text = [text1, text2, text3, text4, text5, text6]
for t in all_text:
    for sent in nltk.sent_tokenize(t):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag, t)

from tinydb import TinyDB
import nltk
import nltk.tag.stanford as st

# tagger
tagger = st.StanfordNERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz',
                              'stanford-ner/stanford-ner.jar')

# tinydb
db = TinyDB('characters.json')

# reading the book
with open('text') as f:
    text = [l.strip() for l in f][:-10]

# retrieving all of the characters
people = set()
current_index = 0
for i, sentence in enumerate(nltk.sent_tokenize(' '.join(text))):
    print('sentence n.', i)
    tokens = nltk.tokenize.word_tokenize(sentence)
    tags = tagger.tag(tokens)
    for t in tags:
        if t[1] == 'PERSON':
            if t[0] not in people:
                db.insert({'c': t[0], 'i': current_index})
                people.add(t[0])
                current_index += 1

import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.wsd import lesk
from scipy.optimize import linear_sum_assignment
from nltk import pos_tag, ne_chunk
import nltk.tag.stanford as st

classifier = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/stanford-ner.jar'
s = st.StanfordNERTagger(classifier, jar)
# nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stop_word = set(stopwords.words('english'))

df = pd.read_csv('data/train.csv')
# print(df.columns.values)
question2_total = df.iloc[:, 4].values
question1_total = df.iloc[:, 3].values
# question1_total = ['what is your name']
# question2_total = ['what should I call you']
# print(question1_total)
question1 = word_tokenize(question1_total[0])
question2 = word_tokenize(question2_total[0])
print(question1)

""" This file contains functions required for extracting Person names """ import nltk import nltk.tag.stanford as stag from nameparser.parser import HumanName from nltk import pos_tag from nltk.chunk import conlltags2tree from nltk.tree import Tree # Please install NLTK and download corresponding files tagger = stag.StanfordNERTagger('/Users/soumya/Documents/Mannheim-Data-Science/Sem 2/Team project/WikiCfp/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz', '/Users/soumya/Documents/Mannheim-Data-Science/Sem 2/Team project/WikiCfp/stanford-ner-2018-10-16/stanford-ner.jar') def stanfordNE2BIO(tagged_sent): """ Function converts the Named Entity tagged sentence to BIO(Beginning Inside Outside) tagged sentence Parameters ---------- tagged_sent : list Sentence tagged by Standford NER tagger Returns ------- list Sentence tagged in BIO format """ bio_tagged_sent = [] prev_tag = "O"
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pandas as pd

sent = []
a = pd.read_csv('t11', sep=" ", header=None)
all_words = a.as_matrix()

import nltk.tag.stanford as st
PATH_TO_GZ = 'C:/Users/Oma/Desktop/Desktop/Fall 2016/NLP/project/tagset/tagger-master/dataset/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'
PATH_TO_JAR = 'C:/Users/Oma/Desktop/Desktop/Fall 2016/NLP/project/tagset/tagger-master/dataset/stanford-ner/stanford-ner.jar'
tagger = st.StanfordNERTagger(PATH_TO_GZ, PATH_TO_JAR)

sent = []
train_sents = []
for i in range(0, 100):
    #print(i)
    if (all_words[i][0] == 'end_of_sentence'):
        train_sents.append(list(sent))
        sent = []
    else:
        sent.append(all_words[i])

print("Complete stage1")

@author: jogr0001
'''
import nltk
import pickle
from unidecode import unidecode
import re
import nltk.tag.stanford as st
import os
import time
from information_retrieval.wikipedia_data import has_wikipedia_page

java_path = "C:/Program Files/Java/jdk1.8.0_101/bin/java.exe"
os.environ['JAVAHOME'] = java_path

tagger = st.StanfordNERTagger(
    "C:/Users/cano2247/Downloads/stanford-ner-2015-12-09/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz",
    "C:/Users/cano2247/Downloads/stanford-ner-2015-12-09/stanford-ner-2015-12-09/stanford-ner.jar"
)

sents = pickle.load(open('../garnissage_pdfs_en.pkl', 'rb'))


def ner_recognition():
    sent = [nltk.sent_tokenize(sent) for sent in sents]
    tags = {}
    list_ner = []
    for s in sent:
        s = [nltk.word_tokenize(phrase) for phrase in s]
        ner_tags_stanford = tagger.tag_sents(s)
        list_ner += ner_tags_stanford
    print(len(list_ner))
    for ner_tags in list_ner:
        #print(unidecode(str(ner_tags)))

print('loaded')

data = []
for item in train:
    data.append((item.split('\t')))

with open("test.csv", "w+") as my_csv:
    csvWriter = csv.writer(my_csv, delimiter=',')
    csvWriter.writerow(['word', 'tag', ' '])
    csvWriter.writerows(data)

test = pd.read_csv('test.csv')
test = test.drop(' ', axis=1)
# print(test.shape)

tagger = st.StanfordNERTagger('/home/mma137421/stanford-ner/ner-model.ser.gz',
                              '/home/mma137421/stanford-ner/stanford-ner.jar')

x_test = test['word'].tolist()
y_test = test['tag'].tolist()
print(len(x_test))
# print(x_test)

sentence = list()
sentences = list()
output = list()
x = list()
for xx in x_test:
    x.append(str(xx))
print(x)
print(type(x))
predict = tagger.tag(x)

import json, re
import nltk.tag.stanford as st
from itertools import groupby

input_file = open("./data/job_ads.json", "r")  # json file with job postings
output_file = open("./data/job_ads_with_tags.json", "w")  # output file

for line in input_file:
    line = unicode(line, "utf-8")
    job_title = json.loads(line)['_source']['doc']['title']  # get the job title
    tagged_title = ''
    tagger = st.StanfordNERTagger('./model/ner-model_titles.ser.gz',
                                  './model/stanford-ner.jar')  # load the tagger
    netagged_words = tagger.tag(
        job_title.encode('utf-8').split())  # list of all the words in the title
    for tag, chunk in groupby(netagged_words, lambda x: x[1]):
        word = " ".join(w for w, t in chunk)  # get word from the title
        if tag == "ROLE":  # if the tag of the word is 'ROLE'
            word = " <START:" + tag + ">" + word + "<END> "  # tag the word
        tagged_title = tagged_title + word
    print('JOB_ID: ' + str(json.loads(line)['_source']['doc']['jobid']))
    print('ORIGINAL_TITLE: ' + str(job_title.encode('utf-8')))
    print('TAGGED_TITLE: ' + str(tagged_title.encode('utf-8')) + '\n')
    line = re.sub(
        job_title.encode('utf-8'), tagged_title.encode('utf-8'),

    current_chunk = []

    for token, tag in tagged_sent:
        if tag != "O":
            current_chunk.append((token, tag))
        else:
            if current_chunk:  # if the current chunk is not empty
                continuous_chunk.append(current_chunk)
                current_chunk = []

    # Flush the final current_chunk into the continuous_chunk, if any.
    if current_chunk:
        continuous_chunk.append(current_chunk)

    return continuous_chunk


# Initialize stanford tagger model
stner = st.StanfordNERTagger(
    '/home/NLP/stanford-ner-2015-04-20/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/home/NLP/stanford-ner-2015-04-20/stanford-ner.jar')

with open('data/extract_entities.txt', 'r') as f:
    text = f.read()

text = space_out_punctuation(text)
tagged_sent = stner.tag(text.split())
named_entities = get_continuous_chunks(tagged_sent)
named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1])
                          for ne in named_entities]
print(named_entities_str_tag)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
import nltk.tag.stanford as st

st = st.StanfordNERTagger(
    'standford-ner/classifiers/Indonesian_Manually_Tagged_Corpus_ID.ser.gz',
    'standford-ner/stanford-ner.jar')

text = """
Setelah itu, mereka masuk ke Jalan Raya Campaka-Ramayana Sadang, Kabupaten Purwakarta. Kemudian putar balik ke Jalan Campaka-Cipeundeuy-Kalijati-Otto Iskandardinata-Ahmad Yani-Jalan Raya Cijambe dan finish di Kantor Kecamatan Jalan Cagak.
Harian Detik hari melaporkan ASIAN GAMES yang diselenggarakan di indonesia, dan pada acara balap sepeda dengan route karawang purwakarta subang dengan titik lokasi -6.571589, 107.758736.
Di awal balapan, pebalap Indonesia, Aiman Cahyadi, Jamal Hibatullah, Dadi Suryadi, dan Robin Manullang sudah keteteran bersaing dengan pebalap lain. Mereka harus mengakui keunggulan dari pebalap Kazakhstan, Korea Selatan, Jepang dan negara lainnya.
Memasuki 10 km terakhir, atlet Indonesia mencoba mempercepat laju sepedanya. Tapi, lagi-lagi tidak bisa mengimbangi keperkasaan atlet Kazaktan yang memang diunggulkan.
Empat atlet Indonesia akhirnya hanya mampu menyentuh garis finis di urutan kesembilan atas nama Aiman Cahyadi dengan catatan waktu 3 jam 26,1 detik disusul Robin Manullang dengan catatan waktu yang sama.
Sementara, dua pebalap Indonesia lainnya Dadi Suryadi harus puas di urutan 19 dan Jamal Hibatullah di urutan 34 dengan catatan waktu masing-masing 3:27:45 dan 5:30:40.
"""

unicode(text, errors='ignore')
for sent in nltk.sent_tokenize(unicode(text, errors='ignore')):
    tokens = nltk.tokenize.word_tokenize(sent)
    tags = st.tag(tokens)
    for tag in tags:
        print(tag)
        if tag[1] == 'PERSON':
            print tag
        if tag[1] == 'LOCATION':
            print tag
        if tag[1] == 'ORGANIZATION':
            print tag
        if tag[1] == 'TIME':
            print tag
        if tag[1] == 'NUMBER':
            print tag
        if tag[1] == 'REGION':
            print tag
        if tag[1] == 'COORDINATES':
            print tag
        if tag[1] == 'CITY':
            print tag

# !/usr/bin/env python -W ignore::DeprecationWarning
import pandas as pd, numpy as np
import nltk
from itertools import chain
import re
import nltk.tag.stanford as st
import os

tagger = st.StanfordNERTagger(
    '../../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    '../../stanford-ner/stanford-ner.jar')


def get_continuous_chunks(tag2, tagged_sent):
    continuous_chunk = []
    current_chunk = []

    for token, tag in tagged_sent:
        if tag == tag2:
            # if tag == "PERSON":
            current_chunk.append((token, tag))
        else:
            if current_chunk:  # if the current chunk is not empty
                continuous_chunk.append(current_chunk)
                current_chunk = []

    # Flush the final current_chunk into the continuous_chunk, if any.
    if current_chunk:
        continuous_chunk.append(current_chunk)

    return continuous_chunk

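# Hedged usage sketch (not part of the original module): combining the tagger and
# get_continuous_chunks defined above to pull contiguous entity spans out of raw
# text. The sample sentence and the helper name `extract_chunks` are assumptions.
def extract_chunks(text, entity_type="PERSON"):
    """Return each contiguous run of `entity_type` tokens in `text` as one string."""
    tagged = tagger.tag(nltk.word_tokenize(text))
    chunks = get_continuous_chunks(entity_type, tagged)
    return [" ".join(token for token, _ in chunk) for chunk in chunks]

# Example (output depends on the model):
# extract_chunks("Barack Obama met Angela Merkel in Berlin.") might return
# ['Barack Obama', 'Angela Merkel'].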