Example #1
def main(filename):
    pdf_content = getPDFContent(filename + '.pdf')
    summy = summarize(pdf_content[1], 0.05)
    # strip ASCII control characters (0-31) and extended characters (127-255)
    text = re.sub(r'[%s]' % ''.join(map(unichr,
                                        range(32) + range(127, 256))), '',
                  pdf_content[1])

    tagger = st.StanfordNERTagger(
        '/home/cgh/PDFile/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/usr/share/stanford-ner/stanford-ner.jar')
    tag_results = tagger.tag(text.split())

    names = []
    for i in tag_results:
        try:
            if i[1] == 'PERSON':
                names.append(i[0])
        except:
            pass

    with open(filename + '_structured.txt', 'w') as f:
        f.write(pdf_content[1])

    total_rank = []
    for section in get_sections(filename + '_structured.txt'):
        sec = re.sub(
            r'[%s]' % ''.join(map(unichr,
                                  range(32) + range(127, 256))), '', section)
        sec_rank_list = score_keyphrases_by_textrank(sec)
        total_rank.append(sec_rank_list[:8])

    words = []
    if total_rank == []:
        sec = re.sub(
            r'[%s]' % ''.join(map(unichr,
                                  range(32) + range(127, 256))), '',
            pdf_content[1])
        sec_rank_list = score_keyphrases_by_textrank(sec)
        total_rank = sec_rank_list[:15]

        for ranks in total_rank:
            words.append(ranks[0])
    else:
        for ranks in total_rank:
            for i in ranks[:3]:
                words.append(i[0])

    #words.pop(0)
    words = list(set(words))
    ytlinks = []
    print words
    # split the keyword list into two halves for the two YouTube queries
    w_1 = words[:len(words) / 2]
    w_2 = words[len(words) / 2:]

    ytlinks.append(get_youtube_links(str(w_1).strip('[]')))
    ytlinks.append(get_youtube_links(str(w_2).strip('[]')))

    return total_rank, summy, names, ytlinks
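The character-stripping pattern above is Python 2 specific (unichr, list concatenation of range); a rough Python 3 equivalent of the same cleanup step, as a sketch, might look like this:

import re

# same character set as above: ASCII control characters (0-31) plus 127-255
NON_PRINTABLE = re.compile(
    '[%s]' % ''.join(map(chr, list(range(32)) + list(range(127, 256)))))

def strip_non_printable(text):
    """Sketch: drop control and extended characters before tagging."""
    return NON_PRINTABLE.sub('', text)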
Example #2
    def __init__(self, f, keywords):
        s = f.read()
        self.keywords = keywords
        self.file = s
        self.sentences = sent_tokenize(s)
        self.parser = StanfordParser(
            "stanford-parser-full-2014-08-27/stanford-parser",
            "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
        self.tagger = st.StanfordPOSTagger(
            "stanford-postagger-full-2014-08-27/models/french.tagger",
            "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
        self.ner = st.StanfordNERTagger(
            "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
            "stanford-ner-2014-08-27/stanford-ner.jar")

        self.trees = []
        for sent in self.sentences:
            try:
                self.trees.append(self.parser.raw_parse(sent))
            except OSError:
                self.trees.append([])
        self.words = self.word_tokenize_without_punc(s)
        self.stemmer = FrenchStemmer()
        self.stems = [self.stemmer.stem(w) for w in self.words]
        self.words_sentences = [
            self.word_tokenize_without_punc(s) for s in self.sentences
        ]
        self.tags = self.tagger.tag(self.words)
        self.tags_sentences = [
            self.tagger.tag(ws) for ws in self.words_sentences
        ]
        self.entities = self.ner.tag(self.words)
        self.entities_sentences = [
            self.ner.tag(ws) for ws in self.words_sentences
        ]
        self.left_subject = defaultdict(lambda: 0)
        self.left_compl = defaultdict(lambda: 0)
        self.left_neg_subject = defaultdict(lambda: 0)
        self.left_neg_compl = defaultdict(lambda: 0)
        self.right_subject = defaultdict(lambda: 0)
        self.right_compl = defaultdict(lambda: 0)
        self.right_neg_subject = defaultdict(lambda: 0)
        self.right_neg_compl = defaultdict(lambda: 0)
        self.left_ref = 0
        self.right_ref = 0
        self.trees_leaves = []
        for e in self.trees:
            res = []
            if e:  # the parse may have failed and left an empty list above
                extract_leaves(list(e)[0], res)
            self.trees_leaves.append(tuple_to_dict(res))
        self.extract_keywords()
Example #3
    def clean_data(self, text):

        if 'strip_urls' in self.attributes:
            text = rx.strip_urls(text)

        # stopword language list: 'english', 'french', 'spanish', 'german', 'portuguese'
        stop_words = []
        for lang in self.lang:
            stop_words.extend(stopwords.words(lang))

        stop_words.extend([
            'brexit', 'twitter', 'tweet', 'euref', 'eureferendum',
            'correspondent', 'referendum', 'pic', 'eurefpic', 'eupic', 'com',
            'bbc', 'co', 'html', 'tweet', 'página', 'anterior', 'iplayer',
            'la', 'pretender', 'pode', 'episode', 'http', 'www', 'javascript',
            'que', 'pic', 'de', 'android', 'source', 'medium', 'video', 'mr',
            'bloomerg', 'economist', self.media
        ])

        # remove stop words
        text = [
            word for word in text.split() if word.lower() not in stop_words
        ]

        if 'stemming' in self.attributes:
            stemmer = PorterStemmer()
            text = [stemmer.stem(w) for w in text]

        if 'lemmatization' in self.attributes:
            wordnet_lemmatizer = WordNetLemmatizer()
            text = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in text]

        # retrieve only nouns
        if 'pos_tag' in self.attributes:
            tagged = pos_tag(text)
            text = [word for word, pos in tagged if re.findall(r'NN', pos)]

        if 'ner' in self.attributes:
            path = os.path.abspath(
                os.curdir) + '\\utils\\stanford-ner-2018-02-27\\'
            tagger = stanford.StanfordNERTagger(
                path + 'classifiers\\english.all.3class.distsim.crf.ser.gz',
                path + 'stanford-ner.jar')

            text = tagger.tag(text)
            text = [word + '_' + entity for word, entity in text]

        # print(" ".join(text))
        if 'w2v' in self.attributes:
            return text
        else:
            return " ".join(text)
Example #4
def get_ner_sentences(sent_list):
    tagger_class = "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/classifiers/%s" % (
        ner_tag_type['class7'])

    stf = st.StanfordNERTagger(
        tagger_class,
        "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/stanford-ner.jar"
    )

    # tokenized_sents = [word_tokenize(sent) for sent in sent_list]
    # ner_sents = stf.tag_sents(tokenized_sents)

    return []
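For reference, a sketch of the same function with the commented-out tagging enabled; word_tokenize, st and the ner_tag_type dict are assumed to be defined elsewhere in the module, as the snippet implies:

def get_ner_sentences_tagged(sent_list):
    tagger_class = "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/classifiers/%s" % (
        ner_tag_type['class7'])
    stf = st.StanfordNERTagger(
        tagger_class,
        "/Users/himanshupal/Downloads/stanford-ner-2017-06-09/stanford-ner.jar")
    tokenized_sents = [word_tokenize(sent) for sent in sent_list]
    return stf.tag_sents(tokenized_sents)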
Example #5
    def __init__(self, classifier_choice=2):

        nerfolderpath = "/home/dicle/Documents/tools/en_stanford_NER/stanford-ner-2015-12-09"
        ext = ".ser.gz"
        classifiers = ["english.all.3class.distsim.crf",
                       "english.conll.4class.distsim.crf",
                       "english.muc.7class.distsim.crf"]
        nerclassifierpath = os.path.join(nerfolderpath, "classifiers",
                                         classifiers[classifier_choice] + ext)

        nerjarname = "stanford-ner-3.6.0.jar"
        nerjarpath = os.path.join(nerfolderpath, nerjarname)
        self.ner_tagger = st.StanfordNERTagger(nerclassifierpath, nerjarpath)
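A minimal usage sketch for the tagger built above; the enclosing class is not shown, so the construction is repeated standalone, and the paths are the same assumptions about the local install:

import os
import nltk.tag.stanford as st

nerfolderpath = "/home/dicle/Documents/tools/en_stanford_NER/stanford-ner-2015-12-09"
classifier = os.path.join(nerfolderpath, "classifiers",
                          "english.muc.7class.distsim.crf.ser.gz")
jar = os.path.join(nerfolderpath, "stanford-ner-3.6.0.jar")

ner_tagger = st.StanfordNERTagger(classifier, jar)
print(ner_tagger.tag("Angela Merkel visited Ankara in March".split()))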
Example #6
def stanford_tagging(data_loc, idx, tmp_loc):
    st = stag.StanfordNERTagger('./lib/english.all.3class.distsim.crf.ser.gz',
                                './lib/stanford-ner.jar')
    for subdir, dirs, files in os.walk(tmp_loc):
        for file in files:
            r_file_path = os.path.join(subdir, file)
            f = open(r_file_path, 'r')

            w_file_path = data_loc + r_file_path[len(tmp_loc):]

            if not os.path.exists(os.path.dirname(w_file_path)):
                os.makedirs(os.path.dirname(w_file_path))

            td = open(w_file_path, 'wb')  # binary mode for pickle output
            for line in f:
                cPickle.dump(st.tag(line.split()), td)
            td.close()
            f.close()
            idx += 1

            if idx % 5 == 0:
                print 'Tagging done for %s files' % str(idx)
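Since each cPickle.dump call above writes one object, reading a file back means looping over pickle.load until EOF; a sketch (plain pickle, Python 3 style):

import pickle

def read_tagged_lines(path):
    """Sketch: yield the per-line tag lists written by stanford_tagging."""
    with open(path, 'rb') as fh:
        while True:
            try:
                yield pickle.load(fh)
            except EOFError:
                break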
Example #7
    def getNamedEntities(self, **kwargs):

        countTerms = Counter()
        pronounTermList, filteredListCollection = self.filterData(
            kwargs['FrequentTerms'])

        if kwargs['FrequentTerms'] is True:
            taggerHandle = st.StanfordNERTagger(
                '/home/harsh/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                '/home/harsh/stanford-ner-2014-06-16/stanford-ner.jar')

            entityList = taggerHandle.tag(pronounTermList)
            finalEntityList = [
                term[0] for term in entityList if not term[1] == 'O'
            ]
            countTerms.update(finalEntityList)
            frequentWords = countTerms.most_common(5)
            # print(frequentWords)
            return frequentWords, filteredListCollection
        else:
            return filteredListCollection
Example #8
print('Processing file.')
# setting parameters
inputFilePath = 'NLP_Challenge_07_Mar.xlsx'
inputSheetName = 'Sheet1'
inputDataColumn = 'A'
tableHasHeader = True
outputFilePath = 'NLP_Challenge_07_Mar_Output.xlsx'
outputSheetName = 'Sheet1'
tokenizer = RegexpTokenizer(r'\w+')
# ner settings
gzPath = 'stanford-ner\\english.all.3class.distsim.crf.ser.gz'
jarPath = 'stanford-ner\\stanford-ner.jar'
tagger = st.StanfordNERTagger(gzPath, jarPath)

# read input data
print('Reading input data.')
inputWorkbook = load_workbook(filename=inputFilePath, read_only=False)
inputWorksheet = inputWorkbook[inputSheetName]
inputWorksheetRowCount = inputWorksheet.max_row
inputData = [cell.value for cell in inputWorksheet[inputDataColumn]]
if tableHasHeader and len(inputData) > 0:
    inputData.pop(0)
print('Processing input data.')
processedData = [process_data(data) for data in inputData]

print('Writing processed data to output excel file.')
outputWorkbook = Workbook()
outputSheet = outputWorkbook.create_sheet('Output Data', 0)
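The snippet stops before anything is written to the new sheet; a sketch of the likely remaining steps, assuming a single-column layout:

outputSheet.append(['processed_text'])   # hypothetical header row
for row in processedData:
    outputSheet.append([row])
outputWorkbook.save(outputFilePath)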
Example #9
with open('agriculture.csv') as csvfile:
    raw = []
    data = csv.DictReader(csvfile)
    for row in data:
        raw.append(row['text'])

### Removing garbage values from tweets
tokenizer = RegexpTokenizer(r'\w+')

### Tokenizing, tagging and removing stop words from tweets
final_words = []
java_path = "C:/Program Files/Java/jdk1.8.0_111/bin"
os.environ['JAVAHOME'] = java_path
tagger = ST.StanfordNERTagger(
    '.../stanford-ner-2014-06-16/stanford-ner-2014-06-16/classifiers/english.conll.4class.distsim.crf.ser.gz',
    '.../stanford-ner-2014-06-16/stanford-ner-2014-06-16/stanford-ner.jar',
    encoding='utf-8')
stop = set(stopwords.words('english'))

for i in range(len(raw)):
    clean_token = tokenizer.tokenize(raw[i])
    mytweet_tag = tagger.tag(clean_token)
    # keep (token, tag) pairs whose token is not an English stop word
    without_stop = [(token, tag) for token, tag in mytweet_tag
                    if token.lower() not in stop]
    final_words.append(without_stop)

### Tag tokens with standard NLP BIO tags
bio_tagged = []
prev_tag = "O"
for i in range(len(final_words)):
    for token, tag in final_words[i]:
        if tag == "O":
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
Example #10
import nltk
import nltk.tag.stanford as st

path_pre = "[your full path for NER package]/stanford-ner-2015-12-09/"

tagger = st.StanfordNERTagger(
    path_pre + 'classifiers/english.all.3class.distsim.crf.ser.gz',
    path_pre + 'stanford-ner.jar')

text1 = """Reality checks await for ambitious Liberals"""
text2 = """Reality checks await for ambitious Liberals!"""
text3 = """Emmanuel means Jesus!"""
text4 = """Does Cherry like Ice Cream like Hanhan does?"""
text5 = """Reality checks await for ambitious Liberals?"""
text6 = """Does Tim Hortons like KFC?"""


all_text = [text1, text2, text3, text4, text5, text6]

for t in all_text:
    for sent in nltk.sent_tokenize(t):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = tagger.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag, t)
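Calling tag() once per sentence starts a Java process each time; a sketch of batching the same texts with tag_sents (available on NLTK's StanfordNERTagger), reusing the tagger defined above:

sentences = [nltk.tokenize.word_tokenize(sent)
             for t in all_text
             for sent in nltk.sent_tokenize(t)]
for tagged in tagger.tag_sents(sentences):
    for token, label in tagged:
        if label == 'PERSON':
            print(token, label)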

Example #11
from tinydb import TinyDB
import nltk
import nltk.tag.stanford as st

# tagger
tagger = st.StanfordNERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')

# tinydb
db = TinyDB('characters.json')

# reading the book
with open('text') as f:
	text = [l.strip() for l in f][:-10]

# retrieving all of the characters
people = set()
current_index = 0

for i, sentence in enumerate(nltk.sent_tokenize(' '.join(text))):
	print('sentence n.', i)

	tokens = nltk.tokenize.word_tokenize(sentence)
	tags = tagger.tag(tokens)

	for t in tags:
		if t[1] == 'PERSON':
			if t[0] not in people:
				db.insert({'c': t[0], 'i': current_index})
				people.add(t[0])
				current_index += 1
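After the pass over the book, the stored characters can be read back from TinyDB, e.g. in insertion order:

# read the collected PERSON tokens back out of characters.json
for record in sorted(db.all(), key=lambda r: r['i']):
    print(record['i'], record['c'])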
Example #12
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.wsd import lesk
from scipy.optimize import linear_sum_assignment
from nltk import pos_tag, ne_chunk
import nltk.tag.stanford as st
classifier = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = '/home/gautam/Desktop/Courses/MTL785/project/stanford-ner-2017-06-09/stanford-ner.jar'
s = st.StanfordNERTagger(classifier, jar)
# nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
stop_word = set(stopwords.words('english'))

df = pd.read_csv('data/train.csv')
# print(df.columns.values)
question2_total = df.iloc[:, 4].values
question1_total = df.iloc[:, 3].values
# question1_total = ['what is your name']
# question2_total = ['what should I call you']
# print(question1_total)
question1 = word_tokenize(question1_total[0])
question2 = word_tokenize(question2_total[0])
print(question1)
"""
This file contains functions required for extracting Person names
"""
import nltk
import nltk.tag.stanford as stag
from nameparser.parser import HumanName
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

# Please install NLTK and download corresponding files
tagger = stag.StanfordNERTagger(
    '/Users/soumya/Documents/Mannheim-Data-Science/Sem 2/Team project/WikiCfp/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    '/Users/soumya/Documents/Mannheim-Data-Science/Sem 2/Team project/WikiCfp/stanford-ner-2018-10-16/stanford-ner.jar')


def stanfordNE2BIO(tagged_sent):
    """
    Converts a named-entity tagged sentence to a BIO (Beginning-Inside-Outside) tagged sentence

    Parameters
    ----------
    tagged_sent : list
        Sentence tagged by the Stanford NER tagger

    Returns
    -------
    list
        Sentence tagged in BIO format

    """
    bio_tagged_sent = []
    prev_tag = "O"
Example #14
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn

import pandas as pd
a = pd.read_csv('t11', sep=" ", header=None)

all_words = a.values  # .as_matrix() was removed from pandas; .values is equivalent
import nltk.tag.stanford as st
PATH_TO_GZ = 'C:/Users/Oma/Desktop/Desktop/Fall 2016/NLP/project/tagset/tagger-master/dataset/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'
PATH_TO_JAR = 'C:/Users/Oma/Desktop/Desktop/Fall 2016/NLP/project/tagset/tagger-master/dataset/stanford-ner/stanford-ner.jar'
tagger = st.StanfordNERTagger(PATH_TO_GZ, PATH_TO_JAR)

sent = []
train_sents = []

for i in range(0, 100):
    #print(i)
    if (all_words[i][0] == 'end_of_sentence'):
        train_sents.append(list(sent))
        sent = []
    else:
        sent.append(all_words[i])
print("COmplete stage1")
Example #15
'''
@author: jogr0001
'''
import nltk
import pickle
from unidecode import unidecode
import re
import nltk.tag.stanford as st
import os
import time
from information_retrieval.wikipedia_data import has_wikipedia_page

java_path = "C:/Program Files/Java/jdk1.8.0_101/bin/java.exe"
os.environ['JAVAHOME'] = java_path
tagger = st.StanfordNERTagger(
    "C:/Users/cano2247/Downloads/stanford-ner-2015-12-09/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz",
    "C:/Users/cano2247/Downloads/stanford-ner-2015-12-09/stanford-ner-2015-12-09/stanford-ner.jar"
)
sents = pickle.load(open('../garnissage_pdfs_en.pkl', 'rb'))


def ner_recognition():
    sent = [nltk.sent_tokenize(sent) for sent in sents]
    tags = {}
    list_ner = []
    for s in sent:
        s = [nltk.word_tokenize(phrase) for phrase in s]
        ner_tags_stanford = tagger.tag_sents(s)
        list_ner += ner_tags_stanford
    print(len(list_ner))
    for ner_tags in list_ner:
        #print(unidecode(str(ner_tags)))
Example #16
print('loaded')

data = []
for item in train:
    data.append((item.split('\t')))

with open("test.csv", "w+") as my_csv:
    csvWriter = csv.writer(my_csv, delimiter=',')
    csvWriter.writerow(['word', 'tag', ' '])
    csvWriter.writerows(data)

test = pd.read_csv('test.csv')
test = test.drop(' ', axis=1)
# print(test.shape)

tagger = st.StanfordNERTagger('/home/mma137421/stanford-ner/ner-model.ser.gz',
                              '/home/mma137421/stanford-ner/stanford-ner.jar')

x_test = test['word'].tolist()
y_test = test['tag'].tolist()
print(len(x_test))
# print(x_test)

sentence = list()
sentences = list()
output = list()
x = [str(xx) for xx in x_test]
print(x)
print(type(x))
predict = tagger.tag(x)
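If the gold tags in y_test line up one-to-one with the tokens passed to the tagger (which the CSV layout above suggests), a rough evaluation sketch using scikit-learn could follow:

from sklearn.metrics import classification_report

y_pred = [tag for _, tag in predict]          # predicted labels, in token order
print(classification_report(y_test, y_pred))  # per-label precision/recall/F1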
Example #17
import json, re
import nltk.tag.stanford as st
from itertools import groupby

input_file = open("./data/job_ads.json", "r")  #json file with job postings
output_file = open("./data/job_ads_with_tags.json", "w")  #output file

tagger = st.StanfordNERTagger('./model/ner-model_titles.ser.gz',
                              './model/stanford-ner.jar')  #load the tagger once

for line in input_file:

    line = unicode(line, "utf-8")

    job_title = json.loads(line)['_source']['doc']['title']  #get the job title

    tagged_title = ''
    netagged_words = tagger.tag(
        job_title.encode('utf-8').split())  #list of all the words in the title

    for tag, chunk in groupby(netagged_words, lambda x: x[1]):
        word = " ".join(w for w, t in chunk)  #get word from the title
        if tag == "ROLE":  #if the tag of the word is 'ROLE'
            word = " <START:" + tag + ">" + word + "<END> "  #tag the word
        tagged_title = tagged_title + word

    print('JOB_ID: ' + str(json.loads(line)['_source']['doc']['jobid']))
    print('ORIGINAL_TITLE: ' + str(job_title.encode('utf-8')))
    print('TAGGED_TITLE: ' + str(tagged_title.encode('utf-8')) + '\n')

    line = re.sub(
        job_title.encode('utf-8'), tagged_title.encode('utf-8'), line)
Example #18
def get_continuous_chunks(tagged_sent):
    continuous_chunk = []
    current_chunk = []

    for token, tag in tagged_sent:
        if tag != "O":
            current_chunk.append((token, tag))
        else:
            if current_chunk:  # if the current chunk is not empty
                continuous_chunk.append(current_chunk)
                current_chunk = []
    # Flush the final current_chunk into the continuous_chunk, if any.
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk


# Initialize stanford tagger model
stner = st.StanfordNERTagger(
    '/home/NLP/stanford-ner-2015-04-20/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/home/NLP/stanford-ner-2015-04-20/stanford-ner.jar')

with open('data/extract_entities.txt', 'r') as f:
    text = f.read()
    text = space_out_punctuation(text)
    tagged_sent = stner.tag(text.split())

    named_entities = get_continuous_chunks(tagged_sent)
    named_entities_str_tag = [(" ".join([token
                                         for token, tag in ne]), ne[0][1])
                              for ne in named_entities]
    print(named_entities_str_tag)
Example #19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
import nltk.tag.stanford as st

tagger = st.StanfordNERTagger(
    'standford-ner/classifiers/Indonesian_Manually_Tagged_Corpus_ID.ser.gz',
    'standford-ner/stanford-ner.jar')
text = """
Setelah itu, mereka masuk ke Jalan Raya Campaka-Ramayana Sadang, Kabupaten Purwakarta. Kemudian putar balik ke Jalan Campaka-Cipeundeuy-Kalijati-Otto Iskandardinata-Ahmad Yani-Jalan Raya Cijambe dan finish di Kantor Kecamatan Jalan Cagak.
Harian Detik hari melaporkan ASIAN GAMES yang diselenggarakan di indonesia, dan pada acara balap sepeda dengan route karawang purwakarta subang dengan titik lokasi -6.571589, 107.758736.
Di awal balapan, pebalap Indonesia, Aiman Cahyadi, Jamal Hibatullah, Dadi Suryadi, dan Robin Manullang sudah keteteran bersaing dengan pebalap lain. Mereka harus mengakui keunggulan dari pebalap Kazakhstan, Korea Selatan, Jepang dan negara lainnya.
Memasuki 10 km terakhir, atlet Indonesia mencoba mempercepat laju sepedanya. Tapi, lagi-lagi tidak bisa mengimbangi keperkasaan atlet Kazaktan yang memang diunggulkan.
Empat atlet Indonesia akhirnya hanya mampu menyentuh garis finis di urutan kesembilan atas nama Aiman Cahyadi dengan catatan waktu 3 jam 26,1 detik disusul Robin Manullang dengan catatan waktu yang sama.
Sementara, dua pebalap Indonesia lainnya Dadi Suryadi harus puas di urutan 19 dan Jamal Hibatullah di urutan 34 dengan catatan waktu masing-masing 3:27:45 dan 5:30:40. 
"""
for sent in nltk.sent_tokenize(unicode(text, errors='ignore')):
    tokens = nltk.tokenize.word_tokenize(sent)
    tags = tagger.tag(tokens)
    for tag in tags:
        print(tag)
        if tag[1] == 'PERSON': print tag
        if tag[1] == 'LOCATION': print tag
        if tag[1] == 'ORGANIZATION': print tag
        if tag[1] == 'TIME': print tag
        if tag[1] == 'NUMBER': print tag
        if tag[1] == 'REGION': print tag
        if tag[1] == 'COORDINATES': print tag
        if tag[1] == 'CITY': print tag
Example #20
# !/usr/bin/env python -W ignore::DeprecationWarning
import pandas as pd, numpy as np
import nltk
from itertools import chain
import re
import nltk.tag.stanford as st
import os

tagger = st.StanfordNERTagger(
    '../../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    '../../stanford-ner/stanford-ner.jar')


def get_continuous_chunks(tag2, tagged_sent):
    continuous_chunk = []
    current_chunk = []

    for token, tag in tagged_sent:
        if tag == tag2:
            # if tag == "PERSON":
            current_chunk.append((token, tag))
        else:
            if current_chunk:  # if the current chunk is not empty
                continuous_chunk.append(current_chunk)
                current_chunk = []
    # Flush the final current_chunk into the continuous_chunk, if any.
    if current_chunk:
        continuous_chunk.append(current_chunk)
    return continuous_chunk
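A short usage sketch for the function above, reusing the tagger defined at the top of the example; the sample sentence and the noted output are illustrative only:

if __name__ == '__main__':
    sample = "Barack Obama met Angela Merkel in Berlin"
    tagged = tagger.tag(sample.split())
    # group consecutive PERSON tokens, then join each chunk into a name
    person_chunks = get_continuous_chunks("PERSON", tagged)
    print([" ".join(token for token, tag in chunk) for chunk in person_chunks])
    # with the 3class model this would typically print ['Barack Obama', 'Angela Merkel']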