def __init__(self, f, keywords):
    s = f.read()
    self.keywords = keywords
    self.file = s
    self.sentences = sent_tokenize(s)
    self.parser = StanfordParser(
        "stanford-parser-full-2014-08-27/stanford-parser",
        "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
    self.tagger = st.StanfordPOSTagger(
        "stanford-postagger-full-2014-08-27/models/french.tagger",
        "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
    self.ner = st.StanfordNERTagger(
        "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
        "stanford-ner-2014-08-27/stanford-ner.jar")
    # Parse every sentence; store an empty parse when the parser fails.
    self.trees = []
    for sent in self.sentences:
        try:
            self.trees.append(self.parser.raw_parse(sent))
        except OSError:
            self.trees.append([])
    self.words = self.word_tokenize_without_punc(s)
    self.stemmer = FrenchStemmer()
    self.stems = [self.stemmer.stem(w) for w in self.words]
    self.words_sentences = [
        self.word_tokenize_without_punc(sent) for sent in self.sentences
    ]
    # POS tags and named entities, for the whole text and per sentence.
    self.tags = self.tagger.tag(self.words)
    self.tags_sentences = [
        self.tagger.tag(words) for words in self.words_sentences
    ]
    self.entities = self.ner.tag(self.words)
    self.entities_sentences = [
        self.ner.tag(words) for words in self.words_sentences
    ]
    # Co-occurrence counters for left- vs. right-keyword features.
    self.left_subject = defaultdict(int)
    self.left_compl = defaultdict(int)
    self.left_neg_subject = defaultdict(int)
    self.left_neg_compl = defaultdict(int)
    self.right_subject = defaultdict(int)
    self.right_compl = defaultdict(int)
    self.right_neg_subject = defaultdict(int)
    self.right_neg_compl = defaultdict(int)
    self.left_ref = 0
    self.right_ref = 0
    # Flatten each parse tree into a mapping of its leaves; skip failed parses
    # (an empty list here would otherwise raise an IndexError).
    self.trees_leaves = []
    for e in self.trees:
        res = []
        parses = list(e)
        if parses:
            extract_leaves(parses[0], res)
        self.trees_leaves.append(tuple_to_dict(res))
    self.extract_keywords()
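# The two helpers used above, extract_leaves and tuple_to_dict, are defined
# elsewhere in this module. A minimal sketch of what they are assumed to do,
# based purely on how they are called here (hypothetical, not the project's
# actual implementation):
def extract_leaves(tree, res):
    # Subtrees of height 2 are preterminals: the label is the POS tag and
    # the single leaf is the word.
    for subtree in tree.subtrees(lambda t: t.height() == 2):
        res.append((subtree.leaves()[0], subtree.label()))

def tuple_to_dict(pairs):
    # Turn [(word, tag), ...] into {word: tag}.
    return {word: tag for word, tag in pairs}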
def extract_words_plus_pos_tags(texts, lang):
    results = []
    if lang in stanford_lang_models:
        import nltk.tag.stanford as stanford_tagger
        tagger = stanford_tagger.StanfordPOSTagger(
            stanford_res_path + stanford_lang_models[lang],
            path_to_jar=stanford_res_path + "stanford-postagger.jar")
        results = tagger.tag(word_tokenize(texts, language=lang_map[lang]))
    if lang == 'en':
        # Convert English Penn Treebank tags to the universal tagset.
        results = [(word, map_tag('en-ptb', 'universal', tag))
                   for word, tag in results]
    return results
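# A hedged usage sketch: stanford_res_path, stanford_lang_models, and
# lang_map are module-level settings this function relies on, so the values
# below are illustrative assumptions only.
stanford_res_path = "resources/stanford-postagger-full-2018-02-27/"
stanford_lang_models = {
    "en": "models/english-left3words-distsim.tagger",
    "de": "models/german-fast.tagger",
}
lang_map = {"en": "english", "de": "german"}  # names word_tokenize expects

tagged = extract_words_plus_pos_tags("The quick brown fox jumps.", "en")
# e.g. [('The', 'DET'), ('quick', 'ADJ'), ...] after the universal mapping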
def train(self):
    # _path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
    _path_to_model = home + '../../stanford-postagger/models/english-left3words-distsim.tagger'
    _path_to_jar = home + '../../stanford-postagger/stanford-postagger.jar'
    self.st = stanford.StanfordPOSTagger(_path_to_model, _path_to_jar)
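# Usage sketch; `extractor` stands in for an instance of the (unshown)
# owning class, and the tagged output shown is illustrative.
extractor.train()
print(extractor.st.tag('This is a test .'.split()))
# e.g. [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('test', 'NN'), ('.', '.')]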
import os

from nltk.corpus import stopwords
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
import nltk.tag.stanford as STag
import nltk.parse.stanford as SParse
from polyglot.text import Text

os.environ['STANFORD_MODELS'] = ('stanford-segmenter-2018-10-16/data/;'
                                 'stanford-postagger-full-2018-10-16/models/')
os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1'

# Segment the raw Arabic text into words.
segmenter = StanfordSegmenter(
    'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('sample.txt')
print(text)

# POS-tag the segmented words with the Arabic model.
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

# Constituency-parse each sentence.
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
        sentence.draw()

# Sentence-split the segmented text with polyglot.
ner = Text(text)
for sent in ner.sentences:
    print(sent)
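# Note that raw_parse_sents yields, for each input sentence, an iterator
# over candidate parses. A sketch (reusing `parser` and `text` from above)
# that keeps only the best-scoring tree per sentence instead of drawing
# every parse:
best_trees = [next(parse_iter)
              for parse_iter in parser.raw_parse_sents(text.split('.'))]
for tree in best_trees:
    print(tree.pformat())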
import os
import sys
sys.path.append("../")
import threading

import nltk
from nltk.tag import stanford

import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils

lst_fn = data_cfg.all_list_fn
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cad_lst_fn)
# No jar path given: NLTK will look the Stanford jar up via CLASSPATH.
pos_tagger = stanford.StanfordPOSTagger(
    os.path.join(data_cfg.root_path, "english-bidirectional-distsim.tagger"))

def tag_pos(qid):
    # Load the pre-tokenized question, tag it, and keep only the POS column.
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, qid + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)
    pos_list = [pos for tok, pos in qpos]
    # feat_root_path is assumed to be defined in the surrounding module.
    pos_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(pos_list, pos_fn)

if __name__ == '__main__':
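# The __main__ body is cut off in this excerpt. Given the threading import
# above, a plausible (purely assumed, not from the source) driver would fan
# tag_pos out over the candidate ids:
def run_all(qids, n_threads=4):
    # Split the id list into n_threads interleaved chunks and tag each
    # chunk on its own thread.
    chunks = [qids[i::n_threads] for i in range(n_threads)]

    def worker(chunk):
        for qid in chunk:
            tag_pos(qid)

    threads = [threading.Thread(target=worker, args=(c,)) for c in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()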
os.environ['STANFORD_MODELS'] = (
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\data;'
    'C:\\Users\\lenovo\\Documents\\salm\\stanford-postagger-full-2018-10-16\\models')
os.environ['STANFORD_PARSER'] = \
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = \
    'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-parser-full-2018-10-17'
# JAVAHOME must point at the JDK install directory, not the installer .exe;
# the directory name below is inferred from the installer's version.
os.environ['JAVAHOME'] = 'C:\\Program Files\\Java\\jdk-14.0.2'

segmenter = StanfordSegmenter(
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('text file')
print(text)

tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
        sentence.draw()

from polyglot.text import Text
text = Text(text)
for sent in text.sentences:
    print(sent)
def extract_component(f, keywords):
    # Run the feature extractor on one file and write one feature per line,
    # mirroring the input name minus its "train/" prefix (name[6:]).
    name = f.name
    fe = FeaturesExtractor(f, keywords)
    data = fe.extract_features()
    new_file = open("features/" + name[6:], "w")
    new_file.write('\n'.join([str(e) for e in data]))
    new_file.close()

def extract_keywords(tagger):
    # Keep the top 50 keywords longer than three characters from each list,
    # then merge the two lists without duplicates.
    left, right = left_vs_right_keywords(tagger)
    left = [e[0] for e in left if len(e[0]) > 3][:50]
    right = [e[0] for e in right if len(e[0]) > 3][:50]
    return list(set().union(left, right))

if __name__ == "__main__":
    tagger = st.StanfordPOSTagger(
        "stanford-postagger-full-2014-08-27/models/french.tagger",
        "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
    files = glob.glob("train/*")
    keywords = extract_keywords(tagger)
    print(keywords)
    for i in range(len(files)):
        print("{} / {}...".format(i + 1, len(files)))
        extract_component(open(files[i], "r"), keywords)
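# left_vs_right_keywords is defined elsewhere; from its use above it is
# assumed to return two lists of (word, score) pairs sorted by importance.
# A hypothetical stand-in (not the project's implementation) for local
# experimentation:
from collections import Counter

def left_vs_right_keywords(tagger, left_texts=(), right_texts=()):
    def ranked(texts):
        # Count nouns (French treebank tags starting with "N") per corpus,
        # most frequent first.
        counts = Counter(
            w.lower()
            for text in texts
            for w, tag in tagger.tag(text.split())
            if tag.startswith("N"))
        return counts.most_common()

    return ranked(left_texts), ranked(right_texts)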
import sys

import numpy
from sklearn import svm
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from nltk.tag import stanford
#from nltk.tag.corenlp import CoreNLPPOSTagger

# This is all done a bit haphazardly, to see whether POS-tag bigrams improve
# performance. If they do, do this properly, probably by taking the parse
# trees already generated for connective identification. The hope/assumption
# is that the parser assigns the same (or very similar) POS tags as the
# tagger does.

# Add the jar and model via their paths (instead of setting environment variables):
jar = '/home/peter/phd/PhdPlayground/stanfordStuff/stanford-postagger-full-2018-02-27/stanford-postagger.jar'
model = '/home/peter/phd/PhdPlayground/stanfordStuff/stanford-postagger-full-2018-02-27/models/german-fast.tagger'
pos_tagger = stanford.StanfordPOSTagger(model, jar, encoding='utf8')

#headers = ['id', 'connective', 'pos', 'sentencePosition', 'pathToRoot', 'class_label'
"""
pos2column = {
    0: 'connective',
    1: 'pos',
    2: 'sentencePosition',
    3: 'pathToRoot',
    4: 'class_label'
}
"""
pos2column = {}
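# The comment above describes the POS-tag-bigram idea, but the bigram code
# itself is not in this excerpt. A minimal sketch using the imports already
# present (the sample sentences are made up):
def pos_bigram_features(sentences):
    # Represent each sentence as a string of POS tags, then let
    # CountVectorizer count tag bigrams as features.
    tag_docs = [
        ' '.join(tag for _, tag in pos_tagger.tag(sent.split()))
        for sent in sentences
    ]
    vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\S+')
    return vectorizer.fit_transform(tag_docs), vectorizer

X, vec = pos_bigram_features(['Das ist ein Test .', 'Ich gehe nach Hause .'])
print(vec.get_feature_names_out())  # tag bigrams such as 'art nn'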