## Module dependencies used below (import paths assumed from the surrounding project layout):
import os
import csv
from copy import deepcopy

from Chrono import referenceToken
from Chrono import utils

## Pass the ML classifier through to the parse SUTime entities method.
## (`infiles`, `args`, and `debug` are assumed to be defined earlier in this script.)

## Loop through each file and parse
for f in range(0, len(infiles)):
    print("Parsing " + infiles[f] + " ...")

    ## Init the ChronoEntity list
    my_chronoentities = []
    my_chrono_ID_counter = 1

    ## parse out the doctime
    doctime = utils.getDocTime(infiles[f] + ".dct")
    if (debug): print(doctime)

    ## parse out reference tokens
    text, tokens, spans, tags, sents = utils.getWhitespaceTokens(infiles[f] + args.x)
    #my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, remove_stopwords="./Chrono/stopwords_short2.txt")
    my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, pos=tags, sent_boundaries=sents)

    ## mark all ref tokens if they are numeric or temporal
    chroList = utils.markTemporal(my_refToks)

    if (debug):
        print("REFERENCE TOKENS:\n")
        for tok in chroList:
            print(tok)

    tempPhrases = utils.getTemporalPhrases(chroList, doctime)
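    ## A debug dump of the extracted temporal phrases, mirroring the REFERENCE
    ## TOKENS block above (a sketch, not part of the original flow; it assumes
    ## the phrase objects returned by utils.getTemporalPhrases() print readably):
    # if (debug):
    #     print("TEMPORAL PHRASES:\n")
    #     for phrase in tempPhrases:
    #         print(phrase)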

def createMLTrainingMatrix(infiles, gold_folder, ext="", save=False, output="aquaint_train", window=3):
    ### Algorithm
    ## For each input file:
    ## 1) parse text to refTokens list
    ## 2) parse SUTime to identify temporal tokens
    ## 3) import gold standard file
    ## 4) get list of periods and intervals with start and end coords
    ## 5) for each period/interval:
    ##      - create feature vector
    ##      - save features to global list
    ## 6) write gold features to a csv file for import by other scripts to train ML methods.

    ## define list of dictionary feature vectors
    obs_list = []  ### The list of features for each observation.
    category = []  ### The category of each observation: 1 for Period, 0 otherwise.
                   ### Note that unknowns are grouped in with the calendar-interval category;
                   ### we probably need to separate those later, or change the algorithm so it
                   ### is not a binary classifier.
    features = {'feat_numeric': 0, 'feat_temp_context': 0, 'feat_temp_self': 0}  ### The full list of features. The keys are used to get the individual feature vectors.

    if (save):
        outfile = open("./gold-standard-parsing.txt", 'w')

    ## Loop through each file and parse
    for f in range(0, len(infiles)):
        print("ML Parsing " + infiles[f] + " ...")

        ## parse out the doctime
        doctime = utils.getDocTime(infiles[f] + ".dct")
        if (debug): print(doctime)

        ## parse out reference tokens
        text, tokens, spans, tags = utils.getWhitespaceTokens(infiles[f] + ext)
        my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, pos=tags)

        ## mark all ref tokens if they are numeric or temporal
        chroList = utils.markTemporal(my_refToks)

        ## import gold standard data
        gold_file = os.path.join(gold_folder, os.path.split(infiles[f])[1], "period-interval.gold.csv")
        gold_list = []

        if not os.path.exists(gold_file):
            print(gold_file + " DOES NOT EXIST")
            break

        if os.path.exists(gold_file):
            if (save):
                outfile.write("\n$$$$$$$$$$$\nProcessing: " + gold_file)

            with open(gold_file) as file:
                reader = csv.DictReader(file)
                for row in reader:
                    gold_list.append({'type': row['type'], 'start': row['start'], 'end': row['end'], 'value': row['value']})
                    if (save):
                        outfile.write("\n" + str(row))

            ## loop through each reftoken term and see if it overlaps with a gold token
            for r in range(0, len(chroList)):
                reftok = chroList[r]
                ref_s, ref_e = reftok.getSpan()

                ## loop through each gold instance and find the one that overlaps with the current reftok
                for g in gold_list:
                    # print(str(g))
                    if utils.overlap([ref_s, ref_e], [int(g['start']), int(g['end'])]):
                        this_obs = {}

                        ## the gold token overlaps with the current reftok, so extract
                        ## the features from the reftok and add them to the list
                        if (save):
                            outfile.write("\nPrevious Token: " + str(chroList[max(r - 1, 0)]))
                            outfile.write("\nTarget Token: " + str(reftok))
                            #print("Length: " + str(len(my_refToks)) + " Last: " + str(min(r + 1, len(my_refToks))))
                            outfile.write("\nNext Token: " + str(chroList[min(r + 1, len(my_refToks) - 1)]) + "\n")

                        ### Identify temporal features
                        this_obs = extract_temp_features(chroList, r, 3, this_obs)

                        ### Extract all words within an N-word window
                        this_obs, observations = extract_bow_features(chroList, r, window, features, this_obs)

                        ### Determine if there is a numeric before or after the target word
                        this_obs = extract_numeric_feature(chroList, r, this_obs)

                        ### Stem and extract the actual word
                        this_obs, observations = extract_stem_feature(chroList[r], features, this_obs)

                        ### Get the correct type
                        if (g['type'] == 'Period'):
                            category.append(1)
                        else:
                            category.append(0)

                        obs_list.append(this_obs)

    ## Ok, I have all the features. Now I just need to put them all together in a matrix.
    print("features length: " + str(len(features.keys())))
    print("obs_list length: " + str(len(obs_list)))
    print("category length: " + str(len(category)))

    ## Now loop through the obs_list to create a list of observations that contain all feature elements.
    full_obs_list = []  # a list of feature dictionaries (the (feats, category) tuple version is commented out below)
    for i in range(0, len(obs_list)):
        feats = deepcopy(features)
        feats.update(obs_list[i])
        #full_obs_list.append((feats, category[i]))
        full_obs_list.append(feats)

    ## Now write the features and categories to files, then return them.
    keys = full_obs_list[0].keys()
    with open(output + '_data.csv', 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(full_obs_list)
    with open(output + '_class.csv', 'w') as output_file:
        for c in category:
            output_file.write("%s\n" % c)

    ### Now return the feature list and the categories
    return (full_obs_list, category)
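
## Example usage (a minimal sketch, not part of the original script; the document
## paths below are hypothetical, and it assumes each document has a matching
## "<name>.dct" doctime file and that gold_folder holds one subdirectory per
## document containing a period-interval.gold.csv):
#
#   infiles = ["./data/APW19980807.0261", "./data/APW19980808.0022"]
#   obs, cats = createMLTrainingMatrix(infiles, gold_folder="./gold", ext="",
#                                      save=False, output="aquaint_train", window=3)
#   ## Writes aquaint_train_data.csv (one feature row per gold period/interval)
#   ## and aquaint_train_class.csv (one 0/1 label per row, 1 = Period).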