def read_conllxi(path):
    lines = file_util.read_line_list(path, f_encoding='UTF-8')
    sentences = []
    temp = []
    for line in lines:
        if line == "":
            sentences.append(sentence.Sentence(temp))
            temp = []
        else:
            temp.append(line)
    if len(temp) != 0:
        sentences.append(sentence.Sentence(temp))
    return sentences

def process_srt(self, filename):
    with open(filename, mode='r') as file_srt:
        subs = file_srt.read()
    subs_buffer = subs.split('\n')
    for idx, val in enumerate(subs_buffer):
        if val.isdigit():
            time_label = subs_buffer[idx + 1].split(" --> ")
            time_label_start = time_label[0].split(',')[0]
            time_label_start_remainder = time_label[0].split(',')[1]
            time_label_end = time_label[1].split(',')[0]
            time_label_end_remainder = time_label[1].split(',')[1]
            sub_text = subs_buffer[idx + 2]
            sub_text = self.sentence_cleaning(sub_text)
            if sub_text != "" and idx + 3 < len(subs_buffer):
                # print len(subs_buffer)
                # print idx
                if subs_buffer[idx + 3] != "":
                    sub_text = sub_text + " " + subs_buffer[idx + 3]
                    idx = idx + 5
                else:
                    idx = idx + 4
                new_sentence = sentence.Sentence(
                    time_label_start, int(time_label_start_remainder),
                    time_label_end, int(time_label_end_remainder), sub_text)
                if len(self.sentences) > 0 and self.sentences[-1].time_end != new_sentence.time_start:
                    self.sentences[-1].time_end = new_sentence.time_start
                    self.sentences[-1].sample_end = new_sentence.sample_start
                self.sentences.append(new_sentence)

def create_sentences(self, sentence_tuple_list):
    c = self.__get_cursor()
    c.executemany(
        "INSERT INTO " + self.sentence_table_name + " VALUES (?,?,?,?)",
        sentence_tuple_list)
    self.db.commit()
    return [sentence.Sentence(*x) for x in sentence_tuple_list]

def keepForDiabetesCorpus(xmldoc):
    """Return True if we should keep this abstract for the diabetes corpus.

    Include abstract in diabetes corpus if it contains at least one cost
    value or term.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if abstractNodes is None or len(abstractNodes) == 0:
        return False
    textNodeList = abstractNodes[0].getElementsByTagName('AbstractText')
    if textNodeList is None or len(textNodeList) == 0:
        return False
    nCostValues = 0
    nCostTerms = 0
    tokenCount = 0
    cueLemmaSet = {"cost", "QALY", "QALYs"}
    for textNode in textNodeList:
        text = xmlutil.getText(textNode)
        sentenceList = sentenceSplitter.tokenize(text)
        for sText in sentenceList:
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            s = sentence.Sentence(tokenList)
            for token in s:
                tokenCount += 1
                lemmatizeabstracts.lemmatizeToken(token)
                if token.lemma in cueLemmaSet or token.text.find('cost') >= 0:
                    nCostTerms += 1
                if cvFinder.tokenIsCostValue(token):
                    nCostValues += 1
    return (nCostValues > 0 or nCostTerms > 0) and tokenCount > 100

def get_sentence(self, content_id, language, sentence_number):
    c = self.__get_cursor()
    c.execute(
        "SELECT * FROM " + self.sentence_table_name +
        " WHERE content_id=? and language=? and sentence_number=?",
        (content_id, language, sentence_number))
    sentence_data = c.fetchone()
    if sentence_data:
        return sentence.Sentence(*sentence_data)
    else:
        return None

def unplagarize(self):
    for x in xrange(0, len(self.sentences) / 2, 2):
        self.final_sentences.append(
            sentence.Sentence(self.sentences[x], self.sentences[x + 1]).unplagarize)
    print self.sentences
    print self.final_sentences
    full_paragraph = self.final_sentences[0]
    for x in xrange(1, len(self.final_sentences)):
        full_paragraph = full_paragraph + " " + self.final_sentences[x]
    return full_paragraph

def __init__(self, treebank=None):
    treebank = self._get_treebank(treebank)
    S, Gold = [], []
    for t in treebank.get_trees():
        s = sentence.Sentence(t.leaves())
        S += [s]
        # Gold += [depset.deptree_to_depset(t)]
        Gold += [t.depset]
    self.S = S
    self.Gold = Gold

def main():
    """
    baseline system which tags only with a gazette
    """
    dic, max_key_len = gazette.load(_OPTS.gazette)
    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        sent.tag_nes(dic, max_key_len)
        filtered = [_.json_obj for _ in _filter_dic_nes(sent.dic_nes)]
        sent_obj['NE'] = filtered
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)

def main():
    """
    convert from JSON to CRFsuite feature format
    """
    dic, max_key_len = gazette.load(_OPTS.gazette)
    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        sent.tag_nes(dic, max_key_len)
        for morp in sent.morps:
            features = feature.get_all_feat(sent, morp.id())
            print('%s\t%s' % (sent.label(morp.id()), '\t'.join(features)))
        print()

def make_sentences(self):
    '''Make a sentence from all consecutive words from 1 speaker between eol (. ! ?).'''
    if verbose:
        print('creating sentences, words between eols')
    self.sentences = []
    sentence_wl = []
    sentence_index = 0
    for w in self.words:
        sentence_wl.append(w)
        if w.eol:
            self.sentences.append(sentence.Sentence(sentence_wl, sentence_index))
            if self.corpus == 'CGN':
                self.sentences[-1].overlap_unknown = False
            sentence_index += 1
            sentence_wl = []
    self.nsentences = len(self.sentences)

def main(w2v_path, model_path):
    """
    tag person(PS) with SVM classifier
    :param w2v_path: word2vec file path
    :param model_path: model path
    """
    w2v_dic = word2vec.load(w2v_path)
    svm_model = cPickle.load(open(model_path, 'rb'))
    json_obj = json.load(sys.stdin)
    for sent_obj in json_obj['sentence']:
        sent = sentence.Sentence(sent_obj)
        ps_nes = _tag_ps(w2v_dic, svm_model, sent)
        sent_obj['NE'] = _merge_ne(sent, ps_nes)
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)

def _count(gold, test):
    """
    count gold, test and matched NEs
    :param gold: gold standard
    :param test: test
    :return: (gold, test, match) counter triple
    """
    gold_cnt = defaultdict(int)
    test_cnt = defaultdict(int)
    match_cnt = defaultdict(int)
    for gold_sent, test_sent in zip(gold['sentence'], test['sentence']):
        if len(gold_sent['text']) != len(test_sent['text']):
            logging.error('content of sentences are different:')
            logging.error('\tgold: %s', gold_sent['text'])
            logging.error('\ttest: %s', test_sent['text'])
            sys.exit(2)
        gold_nes = set(
            [NE(_['begin'], _['end'], _['type']) for _ in gold_sent['NE']])
        # gold_cnt.update([_.cate for _ in gold_nes])    # Counter only in 2.7
        for entity in gold_nes:
            gold_cnt[entity.cate] += 1
        test_nes = set(
            [NE(_['begin'], _['end'], _['type']) for _ in test_sent['NE']])
        # test_cnt.update([_.cate for _ in test_nes])    # Counter only in 2.7
        for entity in test_nes:
            test_cnt[entity.cate] += 1
        match_nes = gold_nes & test_nes
        # match_cnt.update([_.cate for _ in match_nes])    # Counter only in 2.7
        for entity in match_nes:
            match_cnt[entity.cate] += 1
        if ERR_CATE:
            gold_only_nes = set(
                [_ for _ in (gold_nes - match_nes) if _.cate in ERR_CATE])
            test_only_nes = set(
                [_ for _ in (test_nes - match_nes) if _.cate in ERR_CATE])
            if gold_only_nes or test_only_nes:
                sent = sentence.Sentence(gold_sent)
                print(sent.to_dbg_str(), file=sys.stderr)
                for ett in sorted(list(gold_only_nes)):
                    print('\t[G] (%s) %s' % (ett.cate, _morp_dbg_str(sent, ett.begin, ett.end)),
                          file=sys.stderr)
                for ett in sorted(list(test_only_nes)):
                    print('\t[T] (%s) %s' % (ett.cate, _morp_dbg_str(sent, ett.begin, ett.end)),
                          file=sys.stderr)
    return gold_cnt, test_cnt, match_cnt

def get_sentences(paragraphs):
    """Get tokenized sentences within each paragraph from a list of paragraphs
    where each paragraph is a string or a list of sentences.
    """
    # Note that this generator yields
    # paragraph = [sent1, sent2, ...] and sent = [token1, token2, ...]
    offset = 0
    for p, paragraph in enumerate(paragraphs):
        sents = split(paragraph) if isinstance(paragraph, str) else paragraph
        yield [
            sentence.Sentence(raw=sent, sentid=(offset + s), rel_id=s, par_id=p)
            for s, sent in enumerate(sents)
        ]
        offset += len(sents)

def __check_top_results(masked_sentence, mistake_position, ngrams, top_results):
    i = 0
    while i != len(top_results["token_strs"]):
        suggestion_sentence = masked_sentence.replace(
            "[MASK]", top_results["token_strs"][i])
        suggestion_ngrams = ngram_model.Ngrams(
            sentence.Sentence(suggestion_sentence), ngrams.language)
        suggestion_mistake_positions = error_checker.get_mistake_positons(
            suggestion_ngrams)
        if mistake_position in suggestion_mistake_positions:
            del top_results["token_strs"][i]
            del top_results["scores"][i]
            del top_results["ranks"][i]
        else:
            i += 1
    return top_results

def __init__(self, treebank=None, training_corpus=None):
    treebank = self._get_treebank(treebank)
    if training_corpus is None:
        training_corpus = treebank
    self.training_corpus = training_corpus
    S, Gold = [], []
    # for s in treebank.sents():
    for s in treebank.tagged_sents():
        s = [x[1] for x in s]
        S += [sentence.Sentence(s)]
    for t in treebank.parsed_sents():
        Gold += [bracketing.tree_to_bracketing(t)]
    self.S = S
    self.Gold = Gold

def main():
    """
    convert from CRFsuite IOB tagged to JSON
    """
    json_obj = json.load(codecs.open(_OPTS.json, 'rt', encoding='UTF-8'))
    iobs = _load_iob_sentences(sys.stdin)
    if len(json_obj['sentence']) != len(iobs):
        logging.error('# of sentences are different %d vs %d',
                      len(json_obj['sentence']), len(iobs))
        sys.exit(1)
    for sent_obj, iob in zip(json_obj['sentence'], iobs):
        sent = sentence.Sentence(sent_obj)
        if len(sent.morps) != len(iob):
            logging.error('morpheme lengths in sentence are different:')
            logging.error('\tjson: %s', len(sent.morps))
            logging.error('\tiob : %s', len(iob))
            sys.exit(2)
        sent_obj['NE'] = _make_nes(sent, iob)
    json.dump(json_obj, sys.stdout, ensure_ascii=False, indent=2)

def keepForDiabetesCorpusCostValue(xmldoc):
    """Return True if we should keep this abstract for the diabetes corpus.

    Include abstract in diabetes corpus if it contains at least *one*
    currency value.
    """
    textNodeList = xmldoc.getElementsByTagName('AbstractText')
    nCostValues = 0
    for textNode in textNodeList:
        text = xmlutil.getText(textNode)
        sentenceList = sentenceSplitter.tokenize(text)
        for sText in sentenceList:
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            s = sentence.Sentence(tokenList)
            for token in s:
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    nCostValues += 1
    return nCostValues > 0

def __init__(self, treebank=None, training_corpus=None):
    """
    The elements of the treebank must be trees with a DepSet in the
    attribute depset.
    """
    treebank = self._get_treebank(treebank)
    if training_corpus is None:
        training_corpus = treebank
    self.test_corpus = treebank
    self.training_corpus = training_corpus
    S = []
    for s in treebank.tagged_sents():
        s = [x[1] for x in s]
        S += [sentence.Sentence(s)]
    self.S = S
    # Extract gold as DepSets:
    # FIXME: call super and do this there.
    self.Gold = [t.depset for t in treebank.parsed_sents()]
    # Extract gold as Bracketings:
    self.bracketing_model = model.BracketingModel(treebank)

def test(self, sentences):
    """
    Inference on test sentences
    input: test sentences
    """
    test_sentences = []
    inferred_trees = []
    total = 0
    for s in sentences:
        test_sentences += [sentence.Sentence(s.words, s.pos_tags)]
    for s in test_sentences:
        trees = s.get_trees()
        i = 0
        no_construction = False
        while len(trees) > 0:
            if i == (len(trees) - 1):
                if no_construction:
                    break
                # if we reach the end start from the beginning
                no_construction = True
                i = 0
            else:
                # extract features
                extracted_features = self.extract_test_features(
                    trees, i, LEFT_CONTEXT, RIGHT_CONTEXT, self.N_FEATURES)
                # estimate the action to be taken for i, i+1 target nodes
                y = self.estimate_action(trees, i, extracted_features)
                # execute the action and modify the trees
                i, trees = self.take_action(trees, i, y)
                if y != SHIFT:
                    no_construction = False
                self.test_actions[y] += 1
        if len(trees) == 1:
            total += 1
        # print total
        inferred_trees += [trees]
    return inferred_trees

def _process_sentences(self, sentences):
    """Generate a list of Sentence or MultiSentence objects from a variety
    of inputs.
    """
    if len(sentences) == 0:
        self.sentences = sentences
        return
    # NOTE: We base all checks on the first tuple in order to ensure
    # consistent (and more efficient) processing.
    # If we already have Sentence-like objects, do nothing
    if isinstance(sentences[0], sentence.Sentence) or \
       isinstance(sentences[0], sentence.MultiSentence):
        self.sentences = sentences
    # If we have a list of strings, assume that each element represents a
    # separate sentence (we expect a list of sentences as input)
    elif isinstance(sentences[0], basestring):
        self.sentences = [sentence.Sentence(s) for s in sentences]
    # If we have a list of lists/tuples, look deeper
    elif isinstance(sentences[0], list) or isinstance(sentences[0], tuple):
        # If it's a list/tuple of strings, check the last string.
        if isinstance(sentences[0][0], basestring):
            # If the last string is sentence-terminating punctuation
            # or doesn't feature a space, assume each string represents
            # a word, and therefore each list/tuple represents a
            # single sentence.
            if sentences[0][-1] in ('.', '?', '!', '\"', '\'') or \
               (len(sentences[0]) > 1 and ' ' not in sentences[0][-1]):
                self.sentences = [sentence.Sentence(s) for s in sentences]
            # Otherwise, assume that each string represents a full
            # sentence and therefore each list/tuple represents a group of
            # multiple connected sentences.
            else:
                self.sentences = [
                    sentence.MultiSentence(map(sentence.Sentence, ms))
                    for ms in sentences
                ]
        # If it's a list/tuple of lists/tuples, just assume that they each
        # contain strings representing words. The inner lists should
        # represent sentences while the outer lists should represent
        # groups of multiple connected sentences.
        elif isinstance(sentences[0][0], list) or \
             isinstance(sentences[0][0], tuple):
            self.sentences = [
                sentence.MultiSentence(map(sentence.Sentence, ms))
                for ms in sentences
            ]
        else:
            print "ERROR: unknown type", str(sentences[0].__class__)
            print "Expected Sentence-like objects or lists of strings",
            print "convertible to Sentence-like objects"
            raise TypeError
    else:
        print "ERROR: unknown type", str(sentences[0].__class__)
        print "Expected Sentence-like objects or lists of strings",
        print "convertible to Sentence-like objects"
        raise TypeError

"""some""" import sentence sen = sentence.Sentence("Hello world.") print("Lazy iterator") print(sen._words()) print(next(sen._words())) print("\nFor loop:") for i in sen: print(i) print("\nSentence.words: ") print(sen.words) print("\nSentence.chars_count: ") print(sen.chars_count) print("\nSentence.other_chars: ") print(sen.other_chars) print("\nSentence[0]: ") print(sen[0]) print("\nSentence[:]: ") print(sen[:]) gen = iter(sen) for i in gen: print(i) print("\nUsing next after using generator in loop:") print(next(gen))
def read_sentences_normalize_ne(stanford_file_name):
    stanford_file = codecs.open(stanford_file_name, 'r', 'utf-8')
    sentences = []
    tokens = []
    token_alignments = []
    text_line = ''
    state = False
    ne_state = False
    money_state = False
    percent_state = False
    number_state = False
    ordinal_state = False
    time_state = False
    date_state = False
    duration_state = False
    set_state = False
    last_ne_tag = ''
    token_counter = 0
    date_re = re.compile(r'^(\d\d\d\d|XXXX)-(\d\d|XX)-(\d\d|XX)$')
    date2_re = re.compile(r'^(\d\d\d\d|XXXX)-(\d\d|XX)$')
    date3_re = re.compile(r'^(\d\d\d\d|XXXX)$')
    for line in stanford_file:
        if line.startswith('Sentence #'):
            if state:
                sentences.append(asent.Sentence(tokens, token_alignments))
            tokens = []
            token_alignments = []
            state = False
            ne_state = False
            money_state = False
            percent_state = False
            number_state = False
            ordinal_state = False
            time_state = False
            date_state = False
            duration_state = False
            set_state = False
            last_ne_tag = ''
            token_counter = 0
        elif line.startswith('[Text=') and line[-2] == ']':
            token = asent.Token.parse_stanford_line(line[1:-2], {})
            # For LOCATION, PERSON, ORGANIZATION, MISC.
            if ne_state and not (token.is_ne and token.ne_tag == last_ne_tag):
                ne_state = False
            if not ne_state and token.is_ne and token.ne_tag in \
                    ['LOCATION', 'PERSON', 'ORGANIZATION', 'MISC']:
                ne_state = True
                # Appends to the front.
                last_ne_tag = token.ne_tag
                token.constant_label = 'name'
                token.const_lexeme = token.word
            # For MONEY:
            if money_state and not (token.is_ne and token.ne_tag == 'MONEY'):
                money_state = False
            elif not money_state and token.is_ne and token.ne_tag == 'MONEY':
                money_state = True
                money_str = token.normalized_ne_tag
                if len(money_str) == 0:
                    # Not treated as money.
                    token.is_ne = False
                    token.ne_tag = ''
                    money_state = False
                elif len(money_str) > 1:  # length 1 is for units
                    unit_ind = 1 if money_str[0] in ['>', '<', '~'] else 0
                    if money_str[1] == '=':
                        unit_ind = 2
                    token.const_lexeme = convert_number(money_str, True)
            # Percentage.
            if percent_state and not (token.is_ne and token.ne_tag == 'PERCENT'):
                percent_state = False
            elif not percent_state and token.is_ne and token.ne_tag == 'PERCENT':
                percent_state = True
                percent_str = token.normalized_ne_tag
                if len(percent_str) > 1:
                    token.normalized_ne_tag = convert_number(percent_str, True)
            if number_state and not (token.is_ne and token.ne_tag == 'NUMBER'):
                number_state = False
            elif not number_state and token.is_ne and token.ne_tag == 'NUMBER':
                number_state = True
                number_str = token.normalized_ne_tag
                if len(number_str) == 0:
                    number_state = False
                    token.is_ne = False
                    token.ne_tag = ''
                else:
                    token.const_lexeme = convert_number(number_str, False)
            if ordinal_state and not (token.is_ne and token.ne_tag == 'ORDINAL'):
                ordinal_state = False
            elif not ordinal_state and token.is_ne and token.ne_tag == 'ORDINAL':
                ordinal_state = True
                number_str = token.normalized_ne_tag
                if len(number_str) == 0:
                    number_state = False
                    token.is_ne = False
                    token.ne_tag = ''
                else:
                    token.const_lexeme = convert_number(number_str, False)
            if time_state and not (token.is_timex and token.ne_tag in ['DATE', 'TIME']):
                time_state = False
            elif not time_state and (token.is_timex and token.ne_tag in ['DATE', 'TIME']):
                # The same date and time expression can contain both DATE and TIME.
                time_state = True
            if time_state and not date_state and token.ne_tag == 'DATE':
                # Only match pure date expressions
                # - cannot convert compound expressions cleanly enough.
                date_str = token.normalized_ne_tag
                if len(date_str.split()) == 1:
                    # Strip time from string.
                    if 'T' in date_str:
                        date_str = date_str[:date_str.index('T')]
                    if re.match(r'^\d\d\dX$', date_str):
                        date_str = date_str[:3] + '0'
                    if re.match(r'^\d\dXX$', date_str):
                        date_str = date_str[:2] + '00'
                    m = date_re.match(date_str)
                    m2 = date2_re.match(date_str)
                    m3 = date3_re.match(date_str)
                    if m or m2 or m3:
                        date_state = True
                        if m:
                            date_list = list(m.groups())
                        elif m2:
                            date_list = list(m2.groups())
                        elif m3:
                            date_list = list(m3.groups())
                        date_list = filter(lambda d: 'X' not in d, date_list)
                        date_list = [convert_number(date, False) for date in date_list]
                        if date_list:
                            token.const_lexeme = date_list[0]
                    # else don't handle as a date.
            if date_state and token.ne_tag != 'DATE':
                date_state = False
            # For Duration:
            if duration_state and not (token.is_timex and token.ne_tag == 'DURATION'):
                duration_state = False
            elif not duration_state and token.is_timex and token.ne_tag == 'DURATION':
                duration_state = True
                time_str = token.normalized_ne_tag
                period, unit = convert_period(time_str)
                if period == 0:
                    duration_state = False
                else:
                    token.const_lexeme = str(period)
                    token.ne_tag += '_' + unit
            # For SET:
            if set_state and not (token.is_timex and token.ne_tag == 'SET'):
                set_state = False
            elif not set_state and token.is_timex and token.ne_tag == 'SET':
                set_state = True
                freq = 1
                period = 0
                unit = ''
                if token.timex_attr.has_key('freq'):
                    rate_re = re.compile(r'P(\d\d*)([A-Z])')
                    freq_m = rate_re.match(token.timex_attr['freq'])
                    freq = int(freq_m.group(1))
                if token.timex_attr.has_key('periodicity'):
                    period, unit = convert_period(token.timex_attr['periodicity'])
                if period == 0:
                    set_state = False
                    token.ne_tag = ''
                else:
                    if freq > 1:
                        token.ne_tag += '_rate'
                    token.const_lexeme = str(period)
                    token.ne_tag += '_temporal_' + unit
            # Identify numbers:
            if re.match(r'^[+-]?\d+(\.\d+)?$', token.word):
                if token.const_lexeme == '':
                    token.const_lexeme = convert_number(token.word, False)
                token.constant_label = 'number'
                token.pred_lexeme = token.word
            tokens.append(token)
            state = True
    if state:
        sentences.append(asent.Sentence(tokens))
    return sentences

def train(args: Dict):
    MAX_LEN = int(args['--max-len'])
    bs = int(args['--batch-size'])
    model_root = args['--model-root'] if args['--model-root'] else './models'
    dataLoader = sentence.Sentence(args['--train-src'])

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    if args['--cuda']:
        n_gpu = torch.cuda.device_count()
        torch.cuda.get_device_name(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=False)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]
    print(dataLoader.sentences[0])
    print(tokenized_texts[0])

    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    tags = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                         maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                         dtype="long", truncating="post")
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

    # BERT requires a [SEP] token at the end of each sentence as part of its
    # preprocessing; 102 is the index BERT recognizes for [SEP]. It is added to the
    # end of each sequence after padding/truncating, since it might have been removed
    # if the sequence was longer than 75 tokens. Adding it did not noticeably change
    # the results, but it is kept for compatibility with BERT's expected input format.
    for i, inp in enumerate(input_ids):
        if 102 not in inp:
            inp[-1] = 102
            tags[i][-1] = dataLoader.tag2idx.get("O")

    tts = float(args['--train-test-split'])
    tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(
        input_ids, tags, random_state=10, test_size=tts)
    tr_masks, val_masks, _, _ = train_test_split(
        attention_masks, input_ids, random_state=10, test_size=tts)

    tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
    val_inputs = torch.tensor(val_inputs).to(torch.int64)
    tr_tags = torch.tensor(tr_tags).to(torch.int64)
    val_tags = torch.tensor(val_tags).to(torch.int64)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)
    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

    model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=len(dataLoader.tag2idx))
    if args['--cuda']:
        model.cuda()

    FULL_FINETUNING = True if args['--full-finetuning'] else False
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    optimizer = Adam(optimizer_grouped_parameters, lr=float(args['--lr']))

    epochs = int(args['--max-epoch'])
    max_grad_norm = 1.0
    hist_valid_scores = []

    for _ in trange(epochs, desc="Epoch"):
        # TRAIN loop
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            # add batch to gpu
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # forward pass
            loss = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask, labels=b_labels)
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                           max_norm=max_grad_norm)
            # update parameters
            optimizer.step()
            model.zero_grad()
        # print train loss per epoch
        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        # VALIDATION on validation set
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                      attention_mask=b_input_mask, labels=b_labels)
                logits = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        pred_tags = [dataLoader.tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [dataLoader.tags_vals[l_ii] for l in true_labels
                      for l_i in l for l_ii in l_i]
        f1 = f1_score(valid_tags, pred_tags)
        print("F1-Score: {}".format(f1))

        is_better = len(hist_valid_scores) == 0 or f1 > max(hist_valid_scores)
        hist_valid_scores.append(f1)
        if is_better:
            output_model_file = os.path.join(model_root, "model_file.bin")
            output_config_file = os.path.join(model_root, "config_file.bin")
            output_vocab_file = model_root
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_vocab_file)

    print('reached maximum number of epochs!', file=sys.stderr)
    exit(0)

def evaluate(args: Dict):
    model_root = args['--model-root'] if args['--model-root'] else './models'
    print("load model from {}".format(model_root), file=sys.stderr)
    dataLoader = sentence.Sentence(args['--test-src'])
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    output_model_file = os.path.join(model_root, "model_file.bin")
    output_config_file = os.path.join(model_root, "config_file.bin")
    output_vocab_file = os.path.join(model_root, "vocab.txt")
    config = BertConfig.from_json_file(output_config_file)
    model = BertForTokenClassification(config, num_labels=len(dataLoader.tag2idx))
    state_dict = torch.load(output_model_file)
    model.load_state_dict(state_dict)
    tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences]
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    MAX_LEN = int(args['--max-len'])
    input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                   maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels],
                              maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post",
                              dtype="long", truncating="post")
    attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test]
    for i, inp in enumerate(input_ids_test):
        if 102 not in inp:
            inp[-1] = 102
            tags_test[i][-1] = dataLoader.tag2idx.get("O")

    te_inputs = torch.tensor(input_ids_test).to(torch.int64)
    te_tags = torch.tensor(tags_test).to(torch.int64)
    te_masks = torch.tensor(attention_masks_test)

    test_data = TensorDataset(te_inputs, te_masks, te_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=int(args['--batch-size']))

    model.eval()
    predictions = []
    true_labels = []
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        label_ids = b_labels.to('cpu').numpy()
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions]
    test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i]
                 for l in true_labels for l_i in l]
    tags_test_fin = list()
    for l in tags_test:
        temp_tag = list()
        for l_i in l:
            temp_tag.append(dataLoader.tags_vals[l_i])
        tags_test_fin.append(temp_tag)
    print("Test loss: {}".format(eval_loss / nb_eval_steps))
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags)))
    print(classification_report(tags_test_fin, pred_tags))
    print("Number of Test sentences: ", len(tags_test_fin))

def get_sentence_list(self):
    sentences = [sent for sent in nlp(self.transcript).sents]
    return [
        sentence.Sentence(self, s.start_char, s.end_char)
        for s in sentences
    ]

def read_sentences(stanford_file_name, file_id):
    stanford_file = codecs.open(stanford_file_name, 'r', 'utf-8')
    sentences = []
    raw_sentences = []
    tokens = []
    text_line = ''
    state_line = ''
    sent_offset = 0
    state = False
    state1 = False
    for line in stanford_file:
        if line.startswith('Sentence #'):
            if state:
                sentences.append(asent.Sentence(tokens))
                sentences[-1].offset = sent_offset
                sentences[-1].raw_txt = text_line
                sentences[-1].file_id = file_id
            text_line = ''
            state_line = ''
            tokens = []
            state = False
            state1 = False
        elif len(line) > 1 and line[-2] == ']' and (state or line.startswith('[Text=')):
            if state_line:
                token = asent.Token.parse_stanford_line(
                    state_line + ' ' + line[:-2], {})
            else:
                token = asent.Token.parse_stanford_line(line[1:-2], {})
            if not state1:
                sent_offset = token.char_start
            ind_start = token.char_start - sent_offset
            ind_end = token.char_end - sent_offset
            token.reset_char_spans(ind_start, ind_end)
            word = token.original_word
            word = word.replace(u"\u00A0", "_")
            if '_' in word:
                split_word = word.split('_')
                split_inds = filter(lambda x: word[x] == '_', range(len(word)))
                first_word = word[:split_inds[0]]
                token.original_word = first_word
                token.word = first_word
                if normalize_ne:
                    token.pred_lexeme = first_word.lower()
                else:
                    token.pred_lexeme = first_word.lower() + u'/' + token.pos.lower()
                token.const_lexeme = first_word
                token.char_end = token.char_start + split_inds[0]
                tokens.append(token)
                for j, w in enumerate(split_word[1:]):
                    char_start = token.char_start + split_inds[j] + 1
                    if j + 1 < len(split_inds):
                        char_end = token.char_start + split_inds[j + 1]
                    else:
                        char_end = token.char_start + len(word)
                    new_token = asent.Token(w, w, token.pos, token.constant_label,
                                            token.is_ne, token.is_timex, token.ne_tag,
                                            token.normalized_ne_tag,
                                            char_start=char_start, char_end=char_end)
                    tokens.append(new_token)
            else:
                tokens.append(token)
            state = True
            state1 = True
        elif line.startswith('[Text='):
            state_line = line[1:].strip()
            state = True
        else:
            # if line.strip():
            if state:
                state_line += ' ' + line.strip()
            else:
                text_line += line.replace('\n', ' ')
    if state:
        sentences.append(asent.Sentence(tokens))
        sentences[-1].offset = sent_offset
        sentences[-1].raw_txt = text_line
        sentences[-1].file_id = file_id
    return sentences

def __init__(self, user_input, language_string):
    self.sentence = sentence.Sentence(user_input)
    self.ngrams = ngram_model.Ngrams(self.sentence, Language(language_string))
    self.mistake_positions = error_checker.get_mistake_positons(self.ngrams)

def writeInfile(data, filename):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    subword_len_counter = 0
    with open(filename, 'wt', encoding='utf-8') as f:
        for sentence in data:
            for (token, key) in sentence:
                current_subwords_len = len(tokenizer.tokenize(token))
                if current_subwords_len == 0:
                    continue
                if (subword_len_counter + current_subwords_len) > 512:
                    f.write("\n")
                    f.write(token + ' ' + key + '\n')
                    subword_len_counter = 0
                    continue
                subword_len_counter += current_subwords_len
                f.write(token + ' ' + key + '\n')
            f.write('\n')


if __name__ == '__main__':
    traindevgetter = sentence.Sentence('pioner-silver/train.conll03')
    testgetter = sentence.Sentence('pioner-silver/dev.conll03')
    train, dev = partitionRankings(traindevgetter.tagged_sentences, 0.1)
    writeInfile(list(train), 'data/train.txt')
    writeInfile(list(dev), 'data/dev.txt')
    writeInfile(list(testgetter.tagged_sentences), 'data/test.txt')

def genParagraph():
    """generate paragraph"""
    num_sentences = random.randrange(10, 30)
    sentences = [(" ".join(sentence.Sentence())).capitalize() + "."
                 for i in range(num_sentences)]
    return "<p>{0}</p>".format(" ".join(sentences))