def instances(fi):
    """Yield ``(xseq, yseq)`` pairs parsed from TAB-separated text lines.

    Every non-empty line describes one item: the first TAB-separated field
    is the label and each following field is an attribute, written either
    as ``name`` (no weight given) or ``name:weight``. The split happens at
    the *last* colon, so attribute names may themselves contain colons.
    An empty line ends the current sequence.

    Args:
        fi: Iterable of text lines, e.g. an open file.

    Yields:
        ``(crfsuite.ItemSequence, tuple_of_labels)`` for each sequence. A
        final sequence that is not followed by an empty line is yielded as
        well instead of being silently dropped.

    Raises:
        ValueError: If the text after the last ``:`` of a field cannot be
            converted to ``float``.
    """
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    # True while the current sequence holds items that were not yielded yet.
    pending = False

    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq, tuple(yseq)
            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()
            pending = False
            continue

        # Split the line on TAB characters.
        fields = line.split('\t')

        # Build the item from the attribute fields.
        item = crfsuite.Item()
        for field in fields[1:]:
            p = field.rfind(':')
            if p == -1:
                # Attribute without an explicit weight.
                item.append(crfsuite.Attribute(field))
            else:
                # Weighted attribute: name before the last colon, weight after.
                item.append(crfsuite.Attribute(field[:p],
                                               float(field[p + 1:])))

        xseq.append(item)
        yseq.append(fields[0])
        pending = True

    # Input that does not end with an empty line still carries a sequence.
    if pending:
        yield xseq, tuple(yseq)
def read_file_to_crfsuite(crf_input_file, crf_trainer, feature_inclusion_list,
                          participant_list):
    """Feed the sequences of a TAB-separated file to a CRFsuite trainer.

    Line handling, in order of precedence:

    * blank lines are ignored;
    * a line containing ``label`` is the header: it is split on TAB to get
      the column names and passed to ``get_feature_index_list`` to select
      the feature columns;
    * a line containing ``START`` is skipped;
    * a line containing ``END`` closes the current sequence, which is
      appended to ``crf_trainer`` if the most recent participant is in
      ``participant_list``;
    * any other line is a data row: column 0 is the label, column 1 the
      participant (double quotes stripped), and the selected columns become
      attributes named after their header cell.

    Args:
        crf_input_file: Path of the input file.
        crf_trainer: Object exposing ``append(xseq, yseq, group)``.
        feature_inclusion_list: Forwarded to ``get_feature_index_list``.
        participant_list: Participants to keep; the position of a
            participant in this list is used as its group number.

    Raises:
        ValueError: If a selected cell is neither ``NA`` nor a valid float.
    """
    import crfsuite

    feature_index_list = []
    header = []
    # Both are set by data rows; None means "no data row seen yet", so an
    # END line that comes first is skipped instead of raising NameError.
    participant = None
    participant_group = None
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()

    # The context manager guarantees the file is closed, even on error.
    with open(crf_input_file, 'r') as f:
        for line in f:
            # Ignore blank lines.
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                print('found END')
                if participant not in participant_list:
                    continue
                crf_trainer.append(xseq, yseq, participant_group)
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            participant = fields[1].strip('"')
            if participant not in participant_list:
                print('participant ' + participant + ' not found')
                continue
            participant_group = participant_list.index(participant)

            item = crfsuite.Item()
            for i, cell in enumerate(fields):
                if i not in feature_index_list:
                    continue
                # The last cell still carries the line terminator, so compare
                # the stripped text; float() ignores surrounding whitespace.
                if cell.strip() == 'NA':
                    attribute_val = 0
                else:
                    attribute_val = float(cell)
                item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
Beispiel #3
0
def read_file_to_crfsuite(crf_input_file, feature_inclusion_list, crf_tagger, output, options_dict):
    """Tag every sequence of a TAB-separated file and write the predictions.

    Line handling, in order of precedence:

    * blank lines are ignored;
    * a line containing ``label`` is the header: it is split on TAB to get
      the column names and passed to ``crf_train.get_feature_index_list``
      to select the feature columns;
    * a line containing ``START`` is skipped;
    * a line containing ``END`` closes the current sequence: it is tagged
      with ``crf_tagger.tag`` and the result is written out (see below);
    * any other line is a data row: column 0 is the true label (double
      quotes stripped) and the selected columns become attributes named
      after their header cell.

    When ``options_dict['sliding_window_length']`` is non-zero, writing is
    delegated to ``write_prediction_to_file``; otherwise one
    ``prediction,label`` line per item is written to ``output``.

    Args:
        crf_input_file: Path of the input file.
        feature_inclusion_list: Forwarded to
            ``crf_train.get_feature_index_list``.
        crf_tagger: Object exposing ``tag(xseq)``.
        output: Writable text stream.
        options_dict: Mapping that must contain ``sliding_window_length``;
            it is also forwarded to ``write_prediction_to_file``.

    Raises:
        ValueError: If a selected cell is neither ``NA`` nor a valid float.
    """
    import crfsuite

    sliding_window_length = options_dict['sliding_window_length']
    # Start empty so a data row appearing before the header yields an item
    # without attributes instead of raising NameError.
    feature_index_list = []
    header = []
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    overlapped_predictions = []

    # The context manager guarantees the file is closed, even on error.
    with open(crf_input_file, 'r') as f:
        for line in f:
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = crf_train.get_feature_index_list(line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                prediction_seq = crf_tagger.tag(xseq)
                label_seq = util.convert_to_python_list(yseq)
                if sliding_window_length != 0:
                    overlapped_predictions = write_prediction_to_file(
                        prediction_seq, label_seq, overlapped_predictions,
                        output, options_dict)
                else:
                    # Walk predictions and true labels in lockstep.
                    for prediction, label in zip(prediction_seq, yseq):
                        output.write(prediction.strip() + "," + label.strip() + "\n")

                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            item = crfsuite.Item()
            for i, cell in enumerate(fields):
                if i not in feature_index_list:
                    continue
                # The last cell still carries the line terminator, so compare
                # the stripped text; float() ignores surrounding whitespace.
                if cell.strip() == 'NA':
                    attribute_val = 0
                else:
                    attribute_val = float(cell)
                item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
Beispiel #4
0
 def instances(self,fi):
     """Yield ``(xseq, yseq)`` pairs parsed from TAB-separated text lines.

     Every non-empty line describes one item: the first TAB-separated field
     is the label and each following field is an attribute, written either
     as ``name`` (no weight given) or ``name:weight`` (split at the last
     colon). An empty line ends the current sequence.

     Args:
         fi: Iterable of text lines, e.g. an open file.

     Yields:
         ``(crfsuite.ItemSequence, tuple_of_labels)`` for each sequence. A
         final sequence that is not followed by an empty line is yielded
         as well instead of being silently dropped.

     Raises:
         ValueError: If the text after the last ``:`` of a field cannot be
             converted to ``float``.
     """
     xseq = crfsuite.ItemSequence()
     yseq = crfsuite.StringList()
     # True while the current sequence holds items that were not yielded yet.
     pending = False

     for line in fi:
         line = line.strip('\n')
         if not line:
             # An empty line marks the end of a sequence.
             yield xseq, tuple(yseq)
             xseq = crfsuite.ItemSequence()
             yseq = crfsuite.StringList()
             pending = False
             continue

         item = crfsuite.Item()
         fields = line.split('\t')
         yseq.append(fields[0])
         for field in fields[1:]:
             p = field.rfind(':')
             if p == -1:
                 # Attribute without an explicit weight.
                 item.append(crfsuite.Attribute(field))
             else:
                 # Weighted attribute: name before the last colon, weight after.
                 item.append(crfsuite.Attribute(field[:p], float(field[p+1:])))
         xseq.append(item)
         pending = True

     # Input that does not end with an empty line still carries a sequence.
     if pending:
         yield xseq, tuple(yseq)
Beispiel #5
0
def read_svm_format(lines):
    """Yield one ``(items, labels)`` pair per group from ``group_by_newline``.

    Each line is stripped and split on single spaces. The first cell is the
    label; every other cell is an attribute, either bare or weighted::

        Y 15:0.4 16:0.01 19:3.4
        X 4 9 23

    For the two lines above the yielded pair is a ``crfsuite.ItemSequence``
    holding two ``crfsuite.Item`` objects (attributes ``15``/``16``/``19``
    with the given weights, then ``4``/``9``/``23`` without a weight)
    together with the label tuple ``("Y", "X")``.

    A weighted cell is split at its last colon; a cell that is exactly
    ``":"`` is kept whole as an attribute name. How ``lines`` is divided
    into groups is decided entirely by ``group_by_newline``.

    Raises:
        ValueError: If the text after the last colon is not a valid float.
    """
    for sentence_lines in group_by_newline(lines):
        items = crfsuite.ItemSequence()
        tags = crfsuite.StringList()
        for raw_line in sentence_lines:
            # Cells are separated by single space characters.
            cells = raw_line.strip().split(' ')
            item = crfsuite.Item()
            for cell in cells[1:]:
                name, colon, weight = cell.rpartition(':')
                if colon and cell != ':':
                    # "name:weight" -> explicit weight after the last colon.
                    attribute = crfsuite.Attribute(name, float(weight))
                else:
                    # No colon, or the cell is a lone colon: no weight given.
                    attribute = crfsuite.Attribute(cell)
                item.append(attribute)

            # One item and one label per input line.
            items.append(item)
            tags.append(cells[0])
        yield (items, tuple(tags))
Beispiel #6
0
 def __instances(self,fileRead, wordVectors, windowSize, useManualFeature):
     """Yield one ``(xseq, yseq)`` pair per line of a UTF-8 token/label file.

     Each line holds space-separated ``token<sep>label`` entries, where
     ``<sep>`` is ``self.__tokenLabelSeparator`` and the split happens at
     its last occurrence. For every token an item is built from:

     * optional handcrafted features (digit/uppercase/hyphen flags,
       prefixes and suffixes up to length 4, and the templates in
       ``CRFSuite.__featuresTemplates``) when ``useManualFeature`` is true;
     * the word-vector components of each token inside a window of
       ``windowSize`` positions starting at ``i - windowSize // 2``.
       Positions outside the sentence use the start/end sentence symbols.

     Args:
         fileRead: Path of the file, opened with ``codecs`` as UTF-8.
         wordVectors: Iterable of mappings from token to an iterable of
             numbers (must support ``in`` and indexing by token).
         windowSize: Number of window positions per token.
         useManualFeature: Whether to add the handcrafted features.

     Yields:
         ``(crfsuite.ItemSequence, tuple_of_labels)`` for every line,
         including empty lines (which give an empty sequence).
     """
     defval = u''
     halfWindowSize = windowSize // 2
     separator = self.__tokenLabelSeparator

     # The context manager closes the file once the generator finishes.
     with codecs.open(fileRead, 'r', 'utf-8') as dataset:
         for line in dataset:
             xseq = crfsuite.ItemSequence()
             yseq = crfsuite.StringList()
             tokens = []
             labels = []

             for token in line.rstrip().split(' '):
                 if token.isspace() or not token:
                     continue

                 parts = token.rsplit(separator, 1)

                 # A missing separator gives a single part and an empty
                 # suffix gives an empty label; both count as "no label".
                 if len(parts) < 2 or len(parts[1]) == 0:
                     logging.getLogger("Logger").warning("It was not found the label from "\
                                  "the token " + token + ". We give to this token "\
                                  " a label equal to"\
                                  " the tokenLabelSeparator( " + separator +")" )
                     parts = [parts[0], separator]

                 tokens.append(parts[0])
                 labels.append(parts[1])

             for i, current in enumerate(tokens):
                 beginIndex = i - halfWindowSize
                 item = crfsuite.Item()

                 if useManualFeature:
                     item.append(crfsuite.Attribute(self.__createFeature("num" , str(int(re.search(r'\d', current) is not None)))))
                     item.append(crfsuite.Attribute(self.__createFeature("cap" , str(any(c.isupper() for c in current)))))
                     item.append(crfsuite.Attribute(self.__createFeature("hyp" , str(int(re.search('-', current) is not None)))))

                     # Prefixes of length 1..4 (empty default if too short).
                     item.append(crfsuite.Attribute(self.__createFeature("p1", current[0] if len(current) >= 1 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("p2", current[:2] if len(current) >= 2 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("p3", current[:3] if len(current) >= 3 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("p4", current[:4] if len(current) >= 4 else defval)))

                     # Suffixes of length 1..4 (empty default if too short).
                     item.append(crfsuite.Attribute(self.__createFeature("s1", current[-1] if len(current) >= 1 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("s2", current[-2:] if len(current) >= 2 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("s3", current[-3:] if len(current) >= 3 else defval)))
                     item.append(crfsuite.Attribute(self.__createFeature("s4", current[-4:] if len(current) >= 4 else defval)))

                     # Each template is a list of (name, offset) pairs whose
                     # tokens are joined with "|" into a single feature.
                     for featureTemplate in CRFSuite.__featuresTemplates:
                         namesFeature = []
                         valuesFeature = []

                         for name, offset in featureTemplate:
                             namesFeature.append(name + "[" + str(offset) + "]")
                             valuesFeature.append(tokens[i + offset] if i + offset >= 0 and i + offset < len(tokens)  else defval)

                         names = "|".join(namesFeature)
                         values = "|".join(valuesFeature)

                         item.append(crfsuite.Attribute(self.__createFeature(names, values)))

                 for j in range(windowSize):
                     index = beginIndex + j
                     label = str(j) + u'|'

                     if index < 0:
                         token = self.__startSentenceSymbol
                     elif index >= len(tokens):
                         token = self.__endSentenceSymbol
                     else:
                         token = tokens[index]

                         for tokenFilter in self.__filters:
                             token = tokenFilter.filter(token)

                     # k numbers the vector components across all tables for
                     # this window position.
                     k = 0
                     for wordvector in wordVectors:
                         # NOTE(review): if neither the token nor any unknown
                         # token is in this table, ``wv`` keeps its previous
                         # value (or is unbound on the very first lookup).
                         # Left unchanged; confirm the intended fallback.
                         if token in wordvector:
                             wv = wordvector[token]
                         else:
                             for unknownToken in self.__unknownTokens:
                                 if unknownToken in wordvector:
                                     wv = wordvector[unknownToken]
                                     break

                         for number in wv:
                             item.append(crfsuite.Attribute(self.__createFeature(label + str(k),'_'), number))
                             k += 1

                 xseq.append(item)
                 yseq.append(unicodeToSrt(labels[i]))

             yield xseq, tuple(yseq)