def fit_on_data(dir_data=DIR_DATA, name_file=NAME_FILE):
    ''' fit the model on the given annotated file '''
    corpus = RepoModel(dir_data)  # load corpus
    doc = corpus.documents[name_file]  # get document with key
    bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702,
                    show_server_config=True)  # bert model as a service
    words, wordsvec, spans, wordslabel = words_vec_label(doc, bc)

    # wordsvec from list to array
    wordsvec = np.asarray(wordsvec)

    # label encoder
    wordslabel = [label[0] for label in wordslabel]
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(wordslabel)
    Y_encoder = encoder.transform(wordslabel)
    # convert integers to dummy variables (i.e. one-hot encoded)
    Y_encoder = np_utils.to_categorical(Y_encoder)

    # X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)
    X_train, X_test, Y_train, Y_test = wordsvec, wordsvec, Y_encoder, Y_encoder

    # model definition
    N_batch = 4
    N_epoch = 4
    en_verbose = 1
    input_dim = wordsvec.shape[1]
    N_classes = len(set(wordslabel))
    model = create_base_network(X_train[0].shape[0], len(np.unique(wordslabel)))
    model.summary()

    # model training
    start = time.time()
    history = model.fit(X_train, Y_train,
                        batch_size=N_batch,
                        epochs=N_epoch,
                        verbose=en_verbose,
                        validation_data=(X_test, Y_test))
    end = time.time()
    print('time elapsed training:\t', end - start, 'sec')
    return model
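For context, a call to the trainer above might look like the sketch below. The directory and document key are placeholders, and it assumes a bert-serving-server instance is already listening on the ports configured in the function (8701/8702).

# hypothetical corpus directory and document key, for illustration only
model = fit_on_data(dir_data='data/ann_corpus/', name_file='doc_001')
model.save('trigger_classifier.h5')  # Keras models can be persisted like this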
def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys={}):
    '''
    data_dir -> path to brat annotation data; searched recursively.
    model_dir -> path where the spaCy training model is saved.
    exclude_normalize_tags -> list of tags to exclude from normalization.
                              If None, no normalization is performed.
    keys -> dict translating brat tags to training tags; tags not in the
            dict are preserved.
    '''
    r = RepoModel(data_dir, recursive=True, cached=False)
    nlp = spacy.load('en_default')  # v1.1.2 onwards

    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  # we have manually tagged all instances of these

    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))
    # print(normalized_train_data)

    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(u"Hi Adam,\nSounds great to me. I'll send through the QA department. "
              u"In the invite you through Skype, and we can discuss if Applause is right for you.\n"
              u"I look forward to it!\nRegards,\nAndrew")
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)

    if model_dir is not None:
        save_model(nlp, model_dir)
def convert(brat_dir_path: str, output_dir_path: str):
    # load the brat repository
    repo = RepoModel(brat_dir_path)
    print('Loaded {} document(s) from {}'.format(len(repo.documents), brat_dir_path))

    for document_name in repo.documents:
        document = repo.documents[document_name]
        converter = DocConverter(document)
        sentences = converter.sentences
        with open(os.path.join(output_dir_path, '{}.json'.format(document_name)), 'x') as output_file:
            json.dump(list(map(lambda s: s.to_dict(), sentences)), output_file, indent=2)
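A minimal driver for the JSON converter above could look like this; the paths and the __main__ guard are illustrative and not part of the original code.

import os

if __name__ == '__main__':
    brat_dir = 'data/brat_corpus'   # hypothetical input corpus
    json_dir = 'data/json_output'   # hypothetical output directory
    os.makedirs(json_dir, exist_ok=True)
    convert(brat_dir, json_dir)     # writes one <document>.json per brat document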
def convert(brat_dir_path: str, output_file_path: str, verbose: bool = False):
    # load the brat repository
    repo = RepoModel(brat_dir_path)
    if verbose:
        print('Loaded {} document(s) from {}'.format(len(repo.documents), brat_dir_path))

    # load the SLING commons store and the document schema
    commons = load_commons_store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    writer = sling.RecordWriter(output_file_path)
    for document_name in repo.documents:
        document = repo.documents[document_name]
        reader = DocReader(document)
        converter = DocConverter(commons, schema, document_name)
        converter.convert(reader, writer)
    writer.close()
# print("sent:", sent) for token in word_tokenize(sent): # print("token:", token) ## if len(token) > 1 and token not in contractions and not re.search("^[\W\d]+$", token) and token not in set(methods+strings+comments+operands+operators+variables+URLs): if not re.search("^[\W\d]+$", token) and token not in set( methods + strings + comments + operands + operators + variables + URLs): mywords.append(token) alltokens = set(methods + strings + comments + operands + operators + mywords + variables + URLs) post["words"] = pythontagger.tag(list(alltokens)) # -------- r = RepoModel("../annotations") # load repomodel r.documents # all documents in your brat corpus filename = "9" doc = r.documents[filename] # get document with key 001 # print(doc.sentences) # a list of sentences in document # print(doc.annotations) # the annotation objects in a document # for word in doc.annotations: # print(word.repr, word.labels) for filename, post in posts.items(): doc = r.documents[filename] words = [] for word in doc.annotations: words.append((word.repr, word.labels))
        ANN_FILEs.append(file_name[:-4])

DIR_MODEL = './save/'
file_model_trig = DIR_MODEL + TASK_NAME + '_model_trigger.pkl'
file_model_arg = DIR_MODEL + TASK_NAME + '_model_arg.pkl'

bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702,
                show_server_config=False)  # bert model as a service

triggers, vec_trig, label_trig, args, vec_arg, label_arg = [], [], [], [], [], []
try:
    triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(NAME_DATA_FILE)
    args, vec_arg, label_arg = None, None, None
except:
    corpus = RepoModel(DIR_DATA)  # load corpus
    for ANN_FILE in ANN_FILEs:
        doc = corpus.documents[ANN_FILE]  # get document with key
        ttriggers, tvec_trig, tlabel_trig, targs, tvec_arg, tlabel_arg, tlabel_arg_for_each_trig = get_events_in_mention(doc, bc)
        triggers.extend(ttriggers)
        vec_trig.extend(tvec_trig)
        label_trig.extend(tlabel_trig)
        args.extend(targs)
        vec_arg.extend(tvec_arg)
        label_arg.extend(tlabel_arg)
        print('trigs:', len(vec_trig), 'args:', len(vec_arg))
    joblib.dump([triggers, vec_trig, label_trig, args, vec_arg, label_arg], NAME_DATA_FILE)
    args, vec_arg, label_arg = None, None, None
def corpus_save(dir_data, out_data):
    corpus = RepoModel(dir_data)
    corpus.save_xml(out_data)
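As a usage sketch, this helper can be pointed at a brat directory to export the corpus with bratreader's save_xml; the paths below are placeholders.

# hypothetical paths, for illustration only
corpus_save('data/brat_corpus', 'data/brat_corpus_xml')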
def training(DIR_DATA):
    print('\ndata importing:')
    TASK_NAME = DIR_DATA
    NAME_DATA_FILE = TASK_NAME + '_data_import' + '.save'

    # obtain the list of all annotated files
    ANN_FILEs = []
    DIR_ALL_FILES = os.listdir(DIR_DATA)
    for file_name in DIR_ALL_FILES:
        if file_name.split('.')[-1] == 'txt':
            ANN_FILEs.append(file_name[:-4])

    DIR_MODEL = './save_Eng/'
    file_model_trig = DIR_MODEL + TASK_NAME + '_model_trigger.pkl'
    file_model_arg = DIR_MODEL + TASK_NAME + '_model_arg.pkl'

    triggers, vec_trig, label_trig, args, vec_arg, label_arg = [], [], [], [], [], []
    try:
        triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(NAME_DATA_FILE)
        args, vec_arg, label_arg = None, None, None
    except:
        corpus = RepoModel(DIR_DATA)  # load corpus
        for ANN_FILE in ANN_FILEs:
            bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702,
                            show_server_config=False)  # bert model as a service
            doc = corpus.documents[ANN_FILE]  # get document with key
            ttriggers, tvec_trig, tlabel_trig, targs, tvec_arg, tlabel_arg, tlabel_arg_for_each_trig = get_events_in_mention(doc, bc)
            triggers.extend(ttriggers)
            vec_trig.extend(tvec_trig)
            label_trig.extend(tlabel_trig)
            args.extend(targs)
            vec_arg.extend(tvec_arg)
            label_arg.extend(tlabel_arg)
            print('trigs:', len(vec_trig), 'args:', len(vec_arg))
        joblib.dump([triggers, vec_trig, label_trig, args, vec_arg, label_arg], NAME_DATA_FILE)
        args, vec_arg, label_arg = None, None, None

    print('=' * 65, '\n>>trigger model training:')
    try:
        model_trig, encoder_trig = joblib.load(file_model_trig)
        acc_pre = test_on_data(model_trig, encoder_trig, vec_trig, label_trig, en_verbose=0)
    except:
        # model definition
        input_dim = np.asarray(vec_trig).shape[1]
        N_classes = len(set(label_trig))
        model_trig = create_base_network(input_dim, N_classes)
        encoder_trig = LabelEncoder()
        encoder_trig.fit(label_trig)
        acc_pre = 0

    N_batchs = [len(label_trig), 8192, 4096, 2048, 1024, 512, 32, 16, 8, 4, 2, 1]
    lrs = [0.001, 0.00001]
    for N_batch in N_batchs:
        for lr in lrs:
            Times_training, N_batch, N_epoch, en_verbose = 3, N_batch, max(
                16, int(np.floor(np.sqrt(10 * N_batch)))), 1
            for times in range(1, Times_training):
                the_lr = lr / times
                model_trig, encoder_trig, his = fit_on_data(vec_trig, label_trig,
                                                            model_trig, encoder_trig,
                                                            the_lr,
                                                            N_batch=N_batch,
                                                            N_epoch=N_epoch,
                                                            en_verbose=en_verbose)
                print('acc:{}'.format(his.history['acc'][-1]))
                val_acc = test_on_data(model_trig, encoder_trig, vec_trig, label_trig,
                                       en_verbose=en_verbose)
                # save the model to disk
                joblib.dump([model_trig, encoder_trig],
                            '{}_{:.5f}_{:.5f}_{:.5f}_{:.5f}.pkl'.format(
                                file_model_trig[0:-4], his.history['acc'][-1],
                                val_acc, the_lr, N_batch))
                if val_acc > acc_pre:
                    acc_pre = val_acc
                    # save the best model to disk
                    joblib.dump([model_trig, encoder_trig],
                                '{}.pkl'.format(file_model_trig[0:-4]))
                else:
                    break
    # NOTE: the function returns here, so the argument-model training below is never reached.
    return

    print('=' * 65, '\n>>argument model training:')
    try:
        triggers, vec_trig, label_trig = None, None, None
        triggers, vec_trig, label_trig, args, vec_arg, label_arg = joblib.load(NAME_DATA_FILE)
        triggers, vec_trig, label_trig = None, None, None
        model_arg, encoder_arg = joblib.load(file_model_arg)
        acc_pre = test_on_data(model_arg, encoder_arg, vec_arg, label_arg, en_verbose=0)
    except:
        encoder_arg = LabelEncoder()
        encoder_arg.fit(label_arg)
        # model definition
        input_dim = np.asarray(vec_arg).shape[1]
        N_classes = len(set(label_arg))
        model_arg = create_base_network(input_dim, N_classes)
        acc_pre = 0

    for lr in lrs:
        for N_batch in N_batchs:
            Times_training, N_batch, N_epoch, en_verbose = 3, N_batch, max(
                16, int(np.floor(np.sqrt(10 * N_batch)))), 1
            for times in range(1, Times_training):
                the_lr = lr / times
                model_arg, encoder_arg, his = fit_on_data(vec_arg, label_arg,
                                                          model_arg, encoder_arg,
                                                          the_lr,
                                                          N_batch=N_batch,
                                                          N_epoch=N_epoch,
                                                          en_verbose=en_verbose)
                print('acc:{}'.format(his.history['acc'][-1]))
                val_acc = test_on_data(model_arg, encoder_arg, vec_arg, label_arg,
                                       en_verbose=en_verbose)
                # save the model to disk
                joblib.dump([model_arg, encoder_arg],
                            '{}_{:.5f}_{:.5f}_{:.5f}_{:.5f}.pkl'.format(
                                file_model_arg[0:-4], his.history['acc'][-1],
                                val_acc, the_lr, N_batch))
                if val_acc > acc_pre:
                    acc_pre = val_acc
                    # save the best model to disk
                    joblib.dump([model_arg, encoder_arg],
                                '{}.pkl'.format(file_model_arg[0:-4]))
                else:
                    break
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read the raw text
        try:
            (filepath, tempfilename) = os.path.split(txt_file_path)
            (filename, extension) = os.path.splitext(tempfilename)
            r = RepoModel(filepath)
            r.save_xml(filepath)
            # xml_save(filepath, filename, filename)
            xml_file_path = os.path.join(filepath, filename + '.xml')
            # print("xml_file_path::::", r, file=sys.stderr)
            # if xml_file_path:
            #     pass
            # else:
            #     xml_save(filepath, filename, filename)
            with open(xml_file_path, 'r') as xml_file:
                xml = xml_file.read()
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            j_dic['xml'] = xml
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
from pathlib import Path
import nltk
from data_collector import BratDataCollector
from bratreader.repomodel import RepoModel
from classification import Classification
from deeppavlov import build_model, configs
import operator

# Should this path to the brat repository become a field in the UI?
brat_folder = Path('D:\\Диплом\\prog\\essays\\original')
# brat_folder = Path('D:\\Диплом\\prog\\essays\\russian')
brat_reader = RepoModel(brat_folder)
collector = BratDataCollector(brat_reader)
data = collector.collect_data()


def get_sentiment_statistic(data, language, deeppavlov_model):
    from data_manager import DataManager

    correct_labels = ['Premise', 'Claim']
    ru_sentiment = ['positive', 'neutral', 'negative']
    en_sentiment = ['Positive', 'Neutral', 'Negative']
    cur_sentiment = []
    if language == 'ru':
        cur_sentiment = ru_sentiment
    if language == 'en':
        cur_sentiment = en_sentiment

    args = DataManager().filter_labels(data, correct_labels)
    all_premises = 0
import classifier as classifier
import nltk
import bratreader
import Splitter
import POSTagger
from nltk.corpus import movie_reviews, LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DictionaryTagger import DictionaryTagger
from bratreader.repomodel import RepoModel

reader = RepoModel("bratessays")  # load repomodel
reader.documents
doc = reader.documents["essay01"]  # get document with key "essay01"
print(doc.sentences)    # a list of sentences in the document
print(doc.annotations)  # the annotation objects in the document

text = """It is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers. However, when we discuss the issue of competition or cooperation, what we are concerned about is not the whole society, but the development of an individual's whole life. From this point of view, I firmly believe that we should attach more importance to cooperation during primary education. First of all, through cooperation, children can learn about interpersonal skills which are significant in the future life of all students. What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others. During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred. All of these skills help them to get on well with other people and will benefit them for the whole life. On the other hand, the significance of competition is that how to become more excellence to gain the victory. Hence it is always said that competition makes the society more effective. However, when we consider about the question that how to win the game, we always find that we need the cooperation. The greater our goal is, the more competition we need. Take Olympic games which is a form of competition for instance, it is hard to imagine how an athlete could win the game without the training of his or her coach, and the help of other professional staffs such as the people who take care of his diet, and those who are in charge of the medical care. The winner is the athlete but the success belongs to the whole team. Therefore without the cooperation, there would be no victory of competition. Consequently, no matter from the view of individual development or the relationship between competition and cooperation we can receive the same conclusion that a more cooperative attitudes towards life is more profitable in one's success."""

"""
splitter = Splitter.Splitter()
postagger = POSTagger.POSTagger()

splitted_sentences = splitter.split(text)
print(splitted_sentences)

pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
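As a small illustration of the objects the snippet above loads (not part of the original script), the annotations of a document can be walked like this, using only attributes already shown elsewhere in these examples:

# illustrative only: print each annotated span with its labels and links
for annotation in doc.annotations:
    print(annotation.repr, annotation.labels, annotation.links)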
class DataProcess(object):
    reader = RepoModel(bratessayFolder)  # load repomodel

    def __init__(self):
        pass

    def ProcessData(self):
        for i in range(1, 80):
            doc = reader.documents["essay" + str(i)]
            dataObjectList = []
            annotatedData = set(doc.annotations)
            for annotation in annotatedData:
                # print("annotation :", annotation.repr)
                # print("labels :", annotation.labels.items())
                # print("links :", annotation.links)
                # print("********************************************************************************")
                dataObject = {"annotation": annotation.repr,
                              "labels": annotation.labels.items(),
                              "links": annotation.links}
                dataObjectList.append(dataObject)
            data = self.ExtractDataFeatures(dataObjectList, doc.key)
        return data

    def ExtractDataFeatures(self, data, key):
        full = {}
        wholeobject = []
        labs = []
        links = []
        for items in data:
            sentence = items['annotation']
            full['sentence'] = sentence
            for label in items['labels']:
                for lab in label:
                    if lab != None and lab != []:
                        labs.append(lab)
            full['label'] = labs
            labs = []
            for link in items['links'].iteritems():
                lnk = link[0]
                for x in link[1]:
                    linkk = {lnk: x.repr}
                    links.append(linkk)
            full['links'] = links
            links = []
            wholeobject.append(full)
            full = {}
        completeset.append({key: wholeobject})
        return completeset

    def ClassifyArguments(self, dataset):
        ArgumentComponent = []
        for essay in dataset:
            for item in essay.values():
                for x in item:
                    for label in x['label']:
                        if label == 'Claim' or label == 'MajorClaim' or label == 'Premise':
                            filteredObj = (label, x['sentence'])
                            ArgumentComponent.append(filteredObj)
        return ArgumentComponent

    def ClassifyLinks(self, data):
        Links = []
        for essay in data:
            for item in essay.values():
                for x in item:
                    for link in x:
                        if link == 'links':
                            for stance in x['links']:
                                for y in stance.items():
                                    filteredObj = (y)
                                    Links.append(filteredObj)
        return Links

    def getFilteredWords(self, Components, links=None):
        # provide supporting and attacking arguments, with links if required
        sentences = []
        for (sentiment, words) in Components:
            words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
            sentences.append((words_filtered, sentiment))
        return sentences

    def get_words_in_doc(self, sentences):
        all_words = []
        for (words, sentiment) in sentences:
            all_words.extend(words)
        return all_words

    def get_word_features(self, wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    # -------------------------------------------- Test data extraction --------------------------------------
    def getTestData(self, key):
        doc = reader.documents[key]
        dataObjectList = []
        annotatedData = set(doc.annotations)
        for annotation in annotatedData:
            # print("annotation :", annotation.repr)
            # print("labels :", annotation.labels.items())
            # print("links :", annotation.links)
            # print("********************************************************************************")
            dataObject = {"annotation": annotation.repr,
                          "labels": annotation.labels.items(),
                          "links": annotation.links}
            dataObjectList.append(dataObject)
        data = self.ExtractTestSentences(dataObjectList)
        return data

    def ExtractTestSentences(self, data):
        sentences = []
        for items in data:
            sentence = items['annotation']
            sentences.append(sentence)
        return sentences

    def getTestAccuracyData(self):
        classification = Classification.Classification()
        for i in range(80, 90):
            doc = reader.documents["essay" + str(i)]
            dataObjectList = []
            annotatedData = set(doc.annotations)
            for annotation in annotatedData:
                dataObject = {"annotation": annotation.repr,
                              "labels": annotation.labels.items(),
                              "links": annotation.links}
                dataObjectList.append(dataObject)
            data = self.ExtractDataFeatures(dataObjectList, doc.key)

        preTrainingData = classification.prepareTrainingData(data)
        # arguments and links
        Arguments = preTrainingData[0]
        Links = preTrainingData[1]

        Arg_word_features = classification.getWordFeatures(Arguments)
        Link_word_features = classification.getWordFeatures(Links)

        classification.setWordfeatureSet(Arg_word_features)
        ArgumentTesting_set = nltk.classify.apply_features(classification.extract_features, Arguments)
        classification.setWordfeatureSet(Link_word_features)
        LinksTesting_set = nltk.classify.apply_features(classification.extract_features, Links)

        return [ArgumentTesting_set, LinksTesting_set]

    # ------------------------------------------ utilities -------------------------------------
    def getPathToFile(self, RelativePath):
        dir = os.getcwd()
        ROOT_DIR = os.path.dirname(os.path.abspath(dir))
        folder = os.path.join(ROOT_DIR, RelativePath)
        return folder
import nltk
import os
from bratreader.repomodel import RepoModel
from Utils import Utils
import Classification

bratessayFolder = Utils().getPathToFile('bratessays')
reader = RepoModel(bratessayFolder)
# doc = reader.documents["essay01"]  # get document with key "essay01"
# print("sentences", doc.sentences)    # a list of sentences in the document
# print("annotation :", doc.annotations)  # the annotation objects in a document

completeset = []


class DataProcess(object):
    reader = RepoModel(bratessayFolder)  # load repomodel

    def __init__(self):
        pass

    def ProcessData(self):
        for i in range(1, 80):
            doc = reader.documents["essay" + str(i)]
            dataObjectList = []
            annotatedData = set(doc.annotations)
            for annotation in annotatedData:
                # print("annotation :", annotation.repr)
                # print("labels :", annotation.labels.items())
# test
# TEST_DATA = ('data/test/')
# TEST_DATA = ('data_chinese/test/')
TEST_DATA = ('/home/linbo/Downloads/Annotation/military-corpus/')

TEST_FILEs = []
TEST_ALL_FILES = os.listdir(TEST_DATA)
for test_file_name in TEST_ALL_FILES:
    if test_file_name.split('.')[-1] == 'txt':
        TEST_FILEs.append(test_file_name[:-4])
# print(TEST_FILEs)

test_triggers, test_vec_trig, test_label_trig, test_args, test_vec_arg, test_label_arg = [], [], [], [], [], []
test_text = []
test_line = []
test_label_arg_for_each_trig = []

test_corpus = RepoModel(TEST_DATA)  # load corpus
for TEST_FILE in TEST_FILEs:
    test_doc = test_corpus.documents[TEST_FILE]  # get document with key
    test_ttriggers, test_tvec_trig, test_tlabel_trig, test_targs, test_tvec_arg, test_tlabel_arg, test_tlabel_arg_for_each_trig = get_events_in_mention(test_doc, bc)
    test_triggers.append(test_ttriggers)
    test_vec_trig.append(test_tvec_trig)
    test_label_trig.append(test_tlabel_trig)
    test_args.append(test_targs)
    test_vec_arg.append(test_tvec_arg)
    test_label_arg.append(test_tlabel_arg)
    test_label_arg_for_each_trig.append(test_tlabel_arg_for_each_trig)
    test_text.append(test_doc.text)
    for sent in test_doc.sentences:
    def __init__(self):
        self.LinkType = ""
        self.DrugRepr = ""
        self.OtherRepr = ""
        self.textBetween = ""
        self.text_before = ""
        self.text_after = ""
        self.drug_start = 0
        self.drug_end = 0
        self.other_start = 0
        self.other_end = 0
        self.isPositive = False


source = sys.argv[1]
r = RepoModel(source)
# print(r.documents)

myfile = open("DrugInteractionCSV15.arff", 'w')
# wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# wr.writerow(["LinkType","DrugRepr","OtherRepr","BetweenLength","NumTokensBetween","IsPositive"])
myfile.write('''@RELATION wordcounts
@ATTRIBUTE LinkType string
@ATTRIBUTE DrugRepr string
@ATTRIBUTE OtherRepr string
@ATTRIBUTE BetweenLength numeric
@ATTRIBUTE NumTokensBetween numeric
@ATTRIBUTE betweenText string
@ATTRIBUTE isPositive {True,False}
@DATA
''')
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.corpus = RepoModel(data_dir)
        self.documents = self.corpus.documents
        self.sentences = self.load_sentences()
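The constructor above is a thin wrapper around bratreader; for reference, a rough equivalent using RepoModel directly would be the following sketch (the corpus path is a placeholder):

from bratreader.repomodel import RepoModel

corpus = RepoModel('data/brat_corpus')   # hypothetical path to a brat corpus
documents = corpus.documents             # dict mapping document key -> document
for key, doc in documents.items():
    print(key, len(doc.sentences))       # each document exposes its sentences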