Example #1
0
def read_class_data(path, label=None):
    '''
    Build a list of labelled, stemmed token instances from a file or from
    every entry of a directory.

    path  -- a single file, or a directory whose entries (as returned by
             os.listdir) are each read in turn.
    label -- value stored under the 'label' key of every instance; it is
             assigned as-is and never read from the data here.

    Each record produced by doc_file is split on ',' with unpack; the field
    at index 3, stripped of surrounding double quotes, is tokenized with
    wordpunct_tokenize, lower-cased and stemmed. Only stems whose first
    character is a lowercase letter are kept, which drops stems starting
    with digits or punctuation.

    Returns a list of {'tokens': [...], 'label': label} dicts, or None
    (after printing a message) when path does not exist.
    '''
    # Guard clause: nothing to read when the path is missing.
    if not os.path.exists(path):
        print('Given path does not exist.')
        return

    if os.path.isdir(path):
        sources = [os.path.join(path, name) for name in os.listdir(path)]
    else:
        sources = [path]

    reader = doc_file()
    porter = PorterStemmer()
    dataset = []
    for source in sources:
        # The same reader object is re-pointed at each source in turn.
        reader.path = source
        for raw in reader:
            fields = unpack(raw, ',')
            body = fields[3].strip('"')
            # Lazily stem each lower-cased token, then filter on first char.
            stems = (porter.stem(word.lower())
                     for word in wordpunct_tokenize(body))
            kept = [stem for stem in stems if stem[0].islower()]
            dataset.append({'tokens': kept, 'label': label})
    return dataset
Example #2
0
def convert(bio_path, crff_path, feats, labeled=True, fmt='crfsuite'):
    '''
    Write a CRF feature file from a token-per-line input file.

    bio_path  -- input path handed to doc_file together with is_sent.
    crff_path -- output path; opened for writing (truncated if it exists).
    feats     -- iterable of feature callables taking (token, sent). A
                 feature is emitted as "<name>=<value>" only when the
                 returned value is truthy; <name> is the callable's
                 __name__.
    labeled   -- when False, the string 'empty' is appended to every token
                 so that it serves as the label column.
    fmt       -- 'crfsuite': label first, then tab-separated features.
                 'mallet': space-separated features, then the label.

    Each chunk yielded by doc_file is stripped and split on newlines; every
    line is whitespace-split into a token (list of columns) whose last
    column is the label. One output line is written per token and a blank
    line after each chunk.
    NOTE(review): chunks are presumably sentences (is_sent) -- confirm
    against doc_file.

    Returns None. Prints a message and returns early when the output file
    cannot be opened.
    Raises ValueError for an unsupported fmt, before any file is touched.
    '''
    # Reject unknown formats up front; previously this surfaced as an
    # unbound crff_line on the first token, after truncating the output.
    if fmt not in ('crfsuite', 'mallet'):
        raise ValueError(
            "unsupported fmt %r; expected 'crfsuite' or 'mallet'" % (fmt,))

    try:
        output = open(crff_path, 'w')
    except IOError:
        print('cannot open crff file')
        return

    # 'with' / 'finally' guarantee both handles are released even when a
    # feature function or the reader raises part-way through.
    with output:
        df = doc_file(bio_path, is_sent)
        try:
            for s in df:
                sent = [t.split() for t in s.strip().split('\n')]
                if not labeled:
                    for t in sent:
                        t.append('empty')
                for token in sent:
                    label = token[-1]
                    # Call each feature once and reuse the value for both
                    # the truthiness filter and the emitted string.
                    pairs = []
                    for f in feats:
                        value = f(token, sent)
                        if value:
                            pairs.append(f.__name__ + '=' + str(value))
                    if fmt == 'crfsuite':
                        crff_line = label + '\t' + '\t'.join(pairs)
                    else:  # 'mallet'
                        crff_line = ' '.join(pairs) + ' ' + label
                    output.write(crff_line + '\n')
                output.write('\n')
        finally:
            df.close()