def instances(fi):
    """Yield labelled item sequences parsed from CRFsuite-style text lines.

    Every non-empty line describes one item: the first TAB-separated field
    is the label, the remaining fields are attributes.  An attribute is
    either ``name`` (unweighted) or ``name:weight``; the weight is the text
    after the *last* colon, so attribute names may themselves contain
    colons.  An empty line terminates the current sequence.

    Args:
        fi: Iterable of text lines, e.g. an open file object.

    Yields:
        ``(xseq, yseq)`` pairs, where ``xseq`` is a
        ``crfsuite.ItemSequence`` and ``yseq`` is a tuple holding one label
        per item.

    Raises:
        ValueError: If the text after the last colon of an attribute cannot
            be converted to ``float``.
    """
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    # True while the current sequence holds items that were not yielded yet.
    pending = False

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq, tuple(yseq)
            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()
            pending = False
            continue

        fields = line.split('\t')
        item = crfsuite.Item()
        for field in fields[1:]:
            name, colon, weight = field.rpartition(':')
            if colon:
                # Weighted attribute.
                item.append(crfsuite.Attribute(name, float(weight)))
            else:
                # Unweighted attribute.
                item.append(crfsuite.Attribute(field))
        xseq.append(item)
        yseq.append(fields[0])
        pending = True

    # Input that does not end with an empty line would otherwise lose its
    # final sequence without any diagnostic.
    if pending:
        yield xseq, tuple(yseq)
def read_file_to_crfsuite(crf_input_file, crf_trainer, feature_inclusion_list,
                          participant_list):
    """Feed the sequences of a TAB-separated file to a CRFsuite trainer.

    The file is scanned line by line:

    * blank lines are ignored;
    * a line containing ``"label"`` is taken as the header: it is passed to
      ``get_feature_index_list`` to select the feature columns, and its
      TAB-split fields become the attribute names;
    * a line containing ``"START"`` is skipped;
    * a line containing ``"END"`` closes the current sequence, which is
      appended to ``crf_trainer`` when the most recently read participant
      is listed in ``participant_list``;
    * any other line is a data row: field 0 is the label and field 1 the
      participant (surrounding double quotes are removed from both), and
      the selected columns become float-valued attributes, ``NA`` being
      mapped to 0.

    The marker checks are substring tests on the whole line.  Header names
    are used exactly as they appear in the split header line (the header is
    not stripped before splitting).  Rows that follow the last ``"END"``
    marker are never handed to the trainer.

    Args:
        crf_input_file: Path of the input file.
        crf_trainer: Object whose ``append(xseq, yseq, group)`` receives
            each completed sequence.
        feature_inclusion_list: Forwarded unchanged to
            ``get_feature_index_list``.
        participant_list: Participants to keep; the position of a
            participant in this list is used as the group passed to the
            trainer.

    Raises:
        ValueError: If a selected field is neither ``NA`` nor a float.
    """
    import crfsuite

    feature_index_list = []
    header = []
    # Bound up front so that an "END" marker met before any data row is
    # skipped instead of failing with NameError.
    participant = None
    participant_group = None
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()

    # The context manager closes the file even if parsing raises.
    with open(crf_input_file, 'r') as f:
        for line in f:
            # Ignore blank lines.
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                print('found END')
                if participant is None or participant not in participant_list:
                    continue
                crf_trainer.append(xseq, yseq, participant_group)
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            participant = fields[1].strip('"')
            if participant not in participant_list:
                print('participant ' + participant + ' not found')
                continue
            participant_group = participant_list.index(participant)

            item = crfsuite.Item()
            for i, raw_value in enumerate(fields):
                if i not in feature_index_list:
                    continue
                # The last field still carries the line terminator, so the
                # comparison is made on the stripped text; float() already
                # tolerates surrounding whitespace.
                if raw_value.strip() == 'NA':
                    attribute_val = 0
                else:
                    attribute_val = float(raw_value)
                item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
def read_file_to_crfsuite(crf_input_file, feature_inclusion_list, crf_tagger,
                          output, options_dict):
    """Tag the sequences of a TAB-separated file and write the predictions.

    The file is scanned line by line:

    * blank lines are ignored;
    * a line containing ``"label"`` is taken as the header: it is passed to
      ``crf_train.get_feature_index_list`` to select the feature columns,
      and its TAB-split fields become the attribute names;
    * a line containing ``"START"`` is skipped;
    * a line containing ``"END"`` closes the current sequence, which is
      tagged with ``crf_tagger.tag``;
    * any other line is a data row: field 0 (without surrounding double
      quotes) is the reference label and the selected columns become
      float-valued attributes, ``NA`` being mapped to 0.

    When ``options_dict['sliding_window_length']`` is 0, one
    ``prediction,label`` line per item is written to ``output``.  Otherwise
    the predictions are handed to ``write_prediction_to_file`` and the value
    it returns is carried over to the next sequence.

    The marker checks are substring tests on the whole line.  Header names
    are used exactly as they appear in the split header line.

    Args:
        crf_input_file: Path of the input file.
        feature_inclusion_list: Forwarded unchanged to
            ``crf_train.get_feature_index_list``.
        crf_tagger: Object whose ``tag(xseq)`` returns one prediction per
            item.
        output: Writable object receiving the result lines.
        options_dict: Options; must contain ``'sliding_window_length'`` and
            is also forwarded to ``write_prediction_to_file``.

    Raises:
        KeyError: If ``'sliding_window_length'`` is missing.
        ValueError: If a selected field is neither ``NA`` nor a float.
    """
    sliding_window_length = options_dict['sliding_window_length']
    import crfsuite

    # Bound up front so that a data row met before the header yields an
    # item without attributes instead of failing with NameError.
    feature_index_list = []
    header = []
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    overlapped_predictions = []

    # The context manager closes the file even if tagging or parsing raises.
    with open(crf_input_file, 'r') as f:
        for line in f:
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = crf_train.get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                prediction_seq = crf_tagger.tag(xseq)
                label_seq = util.convert_to_python_list(yseq)
                if sliding_window_length != 0:
                    overlapped_predictions = write_prediction_to_file(
                        prediction_seq, label_seq, overlapped_predictions,
                        output, options_dict)
                else:
                    # Predictions and labels are paired item by item.
                    for prediction, label in zip(prediction_seq, yseq):
                        output.write(
                            prediction.strip() + "," + label.strip() + "\n")
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            item = crfsuite.Item()
            for i, raw_value in enumerate(fields):
                if i not in feature_index_list:
                    continue
                # The last field still carries the line terminator, so the
                # comparison is made on the stripped text; float() already
                # tolerates surrounding whitespace.
                if raw_value.strip() == 'NA':
                    attribute_val = 0
                else:
                    attribute_val = float(raw_value)
                item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
def instances(self, fi):
    """Yield labelled item sequences parsed from CRFsuite-style text lines.

    Every non-empty line describes one item: the first TAB-separated field
    is the label, the remaining fields are attributes, written either as
    ``name`` or as ``name:weight`` (the weight is the text after the *last*
    colon).  An empty line terminates the current sequence.

    Args:
        fi: Iterable of text lines, e.g. an open file object.

    Yields:
        ``(xseq, yseq)`` pairs, where ``xseq`` is a
        ``crfsuite.ItemSequence`` and ``yseq`` is a tuple holding one label
        per item.

    Raises:
        ValueError: If the text after the last colon of an attribute cannot
            be converted to ``float``.
    """
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    # True while the current sequence holds items that were not yielded yet.
    pending = False

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq, tuple(yseq)
            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()
            pending = False
            continue

        item = crfsuite.Item()
        fields = line.split('\t')
        yseq.append(fields[0])
        for field in fields[1:]:
            name, colon, weight = field.rpartition(':')
            if colon:
                item.append(crfsuite.Attribute(name, float(weight)))
            else:
                item.append(crfsuite.Attribute(field))
        xseq.append(item)
        pending = True

    # Input that does not end with an empty line would otherwise lose its
    # final sequence without any diagnostic.
    if pending:
        yield xseq, tuple(yseq)
def read_svm_format(lines):
    """Yield one ``(items, labels)`` pair per sentence of SVM-style input.

    Each line has the form ``LABEL attr attr ...`` with single spaces
    between the cells.  An attribute cell is either a bare name
    (``X 4 9 23``) or ``name:weight`` (``Y 15:0.4 16:0.01 19:3.4``); the
    weight is the text after the last colon.  A cell consisting of a lone
    ``:`` is kept as an unweighted attribute named ``":"``.

    Sentences are whatever groups of lines ``group_by_newline(lines)``
    produces.

    Args:
        lines: Iterable of text lines, forwarded to ``group_by_newline``.

    Yields:
        ``(crfsuite.ItemSequence, tuple_of_labels)`` for every sentence.
    """
    for sentence in group_by_newline(lines):
        items = crfsuite.ItemSequence()
        tags = crfsuite.StringList()

        for raw_line in sentence:
            # Cells are separated by single spaces; the first one is the label.
            columns = raw_line.strip().split(' ')

            item = crfsuite.Item()
            for column in columns[1:]:
                name, colon, weight = column.rpartition(':')
                if colon and column != ':':
                    # Explicit weight after the last colon.
                    item.append(crfsuite.Attribute(name, float(weight)))
                else:
                    # No weight given: the attribute gets the default one.
                    item.append(crfsuite.Attribute(column))

            items.append(item)
            tags.append(columns[0])

        # One pair per group of lines.
        yield (items, tuple(tags))
def __instances(self, fileRead, wordVectors, windowSize, useManualFeature):
    """Yield one CRFsuite sequence per line of a labelled, UTF-8 text file.

    Every line holds space-separated ``token<sep>label`` pairs, where
    ``<sep>`` is ``self.__tokenLabelSeparator`` and the split is made on its
    last occurrence.  For each token an item is built from:

    * when ``useManualFeature`` is true: digit (``num``), upper-case
      (``cap``) and hyphen (``hyp``) indicators, prefixes ``p1``..``p4``,
      suffixes ``s1``..``s4`` and the context features described by
      ``CRFSuite.__featuresTemplates``;
    * for each of the ``windowSize`` positions starting at
      ``i - windowSize // 2``: the numbers of the word vector of the
      (filtered) token at that position, taken from every table in
      ``wordVectors``.  Positions outside the sentence use the start/end
      sentence symbols, and tokens missing from a table fall back to the
      first entry of ``self.__unknownTokens`` the table contains.

    Args:
        fileRead: Path of the UTF-8 encoded input file.
        wordVectors: Iterable of lookup tables supporting ``token in table``
            and ``table[token]``; each value is iterated to obtain the
            attribute weights.
        windowSize: Number of positions in the word-vector window.
        useManualFeature: Whether to add the hand-crafted features.

    Yields:
        ``(xseq, yseq)`` pairs, where ``xseq`` is a
        ``crfsuite.ItemSequence`` and ``yseq`` a tuple of labels.  A line
        without tokens yields an empty pair.

    Raises:
        IndexError: If a token does not contain the token/label separator.
        KeyError: If a window token and every unknown-token fallback are
            absent from one of the word-vector tables.
    """
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    defval = u''
    # Floor division keeps the window offset an integer.
    halfWindowSize = windowSize // 2

    # The context manager closes the file once the generator is exhausted
    # or closed, and also when an error interrupts the parsing.
    with codecs.open(fileRead, 'r', 'utf-8') as dataset:
        for line in dataset:
            tokens = []
            labels = []

            for token in line.rstrip().split(' '):
                if token.isspace() or not token:
                    continue

                t = token.rsplit(self.__tokenLabelSeparator, 1)
                if len(t) < 2:
                    # Same exception type as the bare t[1] lookup, with
                    # enough context to locate the offending input.
                    raise IndexError(
                        "token %r in line %r does not contain the "
                        "token/label separator %r"
                        % (token, line, self.__tokenLabelSeparator))

                if len(t[1]) == 0:
                    logging.getLogger("Logger").warning(
                        "It was not found the label from "
                        "the token " + token + ". We give to this token "
                        " a label equal to"
                        " the tokenLabelSeparator( "
                        + self.__tokenLabelSeparator + ")")
                    t[1] = self.__tokenLabelSeparator

                tokens.append(t[0])
                labels.append(t[1])

            for i, current in enumerate(tokens):
                beginIndex = i - halfWindowSize
                item = crfsuite.Item()

                if useManualFeature:
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "num", str(int(re.search(r'\d', current) is not None)))))
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "cap", str(any(c.isupper() for c in current)))))
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "hyp", str(int(re.search('-', current) is not None)))))

                    # Prefixes of length 1..4 (empty when the token is shorter).
                    for size in range(1, 5):
                        prefix = current[:size] if len(current) >= size else defval
                        item.append(crfsuite.Attribute(
                            self.__createFeature("p" + str(size), prefix)))

                    # Suffixes of length 1..4 (empty when the token is shorter).
                    for size in range(1, 5):
                        suffix = current[-size:] if len(current) >= size else defval
                        item.append(crfsuite.Attribute(
                            self.__createFeature("s" + str(size), suffix)))

                    # NOTE(review): the template features are treated as part
                    # of the manual feature set; the nesting was reconstructed
                    # from flattened source -- confirm.
                    for featureTemplate in CRFSuite.__featuresTemplates:
                        namesFeature = []
                        valuesFeature = []
                        for name, index in featureTemplate:
                            namesFeature.append(name + "[" + str(index) + "]")
                            position = i + index
                            if 0 <= position < len(tokens):
                                valuesFeature.append(tokens[position])
                            else:
                                valuesFeature.append(defval)
                        names = "|".join(namesFeature)
                        values = "|".join(valuesFeature)
                        item.append(crfsuite.Attribute(
                            self.__createFeature(names, values)))

                for j in range(windowSize):
                    index = beginIndex + j
                    label = str(j) + u'|'

                    if index < 0:
                        token = self.__startSentenceSymbol
                    elif index >= len(tokens):
                        token = self.__endSentenceSymbol
                    else:
                        token = tokens[index]

                    # NOTE(review): filters are applied to padding symbols as
                    # well as real tokens; the nesting was reconstructed from
                    # flattened source -- confirm.
                    for tokenFilter in self.__filters:
                        token = tokenFilter.filter(token)

                    # k numbers the vector components of this window position
                    # across all tables.
                    k = 0
                    for wordvector in wordVectors:
                        wv = None
                        if token in wordvector:
                            wv = wordvector[token]
                        else:
                            for unknownToken in self.__unknownTokens:
                                if unknownToken in wordvector:
                                    wv = wordvector[unknownToken]
                                    break

                        if wv is None:
                            # Without this check the vector of a previous
                            # token or table would be reused silently.
                            raise KeyError(
                                "no word vector for token %r and none of "
                                "the unknown-token fallbacks is present in "
                                "the table" % (token,))

                        for number in wv:
                            item.append(crfsuite.Attribute(
                                self.__createFeature(label + str(k), '_'),
                                number))
                            k += 1

                xseq.append(item)
                yseq.append(unicodeToSrt(labels[i]))

            yield xseq, tuple(yseq)

            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()