def read_class_data(path, label=None):
    '''
    Load stemmed, tokenized instances from a file or a directory of files.

    Every instance produced here carries the ``label`` given by the caller;
    nothing in this function reads a label out of the records themselves.

    Args:
        path: A single file, or a directory. For a directory every direct
            entry returned by ``os.listdir`` is read (no recursion and no
            filtering by extension).
        label: Value stored under ``'label'`` in each instance
            (``None`` when omitted).

    Returns:
        A list of ``{'tokens': [...], 'label': label}`` dicts, one per
        record, or ``None`` (after printing a message) when ``path`` does
        not exist.

    Raises:
        Whatever ``record[3]`` raises when an unpacked record has fewer
        than four fields (IndexError if ``unpack`` returns a list --
        NOTE(review): confirm against ``unpack``).
    '''
    if not os.path.exists(path):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('Given path does not exist.')
        return None

    if os.path.isdir(path):
        paths = [os.path.join(path, name) for name in os.listdir(path)]
    else:
        paths = [path]

    # NOTE(review): the reader is never closed here, although convert()
    # calls close() on its doc_file -- confirm whether that is needed.
    doc = doc_file()
    stemmer = PorterStemmer()
    instances = []
    for p in paths:
        # One reader object is re-pointed at each file in turn.
        doc.path = p
        for raw_record in doc:
            record = unpack(raw_record, ',')
            # The text lives in the fourth comma-separated field; drop the
            # surrounding double quotes.
            text = record[3].strip('"')
            tokens = []
            for t in wordpunct_tokenize(text):
                stem_t = stemmer.stem(t.lower())
                # Keep only stems that start with a lowercase letter, which
                # drops punctuation and digit-initial tokens. Slicing rather
                # than indexing makes an empty stem a non-match instead of
                # an IndexError.
                if stem_t[:1].islower():
                    tokens.append(stem_t)
            instances.append({'tokens': tokens, 'label': label})
    return instances
def convert(bio_path, crff_path, feats, labeled=True, fmt='crfsuite'):
    '''
    Convert a BIO-style token file into a CRF feature file.

    Each sentence read from ``bio_path`` is split into lines and each line
    into whitespace-separated fields; the last field of a token is its
    label. One output line is written per token, followed by one blank
    line per sentence.

    Args:
        bio_path: Input path handed to ``doc_file``.
        crff_path: Output path; opened for writing (truncated).
        feats: Iterable of feature callables ``f(token, sent)``. A feature
            is emitted as ``<f.__name__>=<value>`` only when the returned
            value is truthy.
        labeled: When False, the placeholder label ``'empty'`` is appended
            to every token before features are computed.
        fmt: ``'crfsuite'`` writes ``label<TAB>feat<TAB>feat...``;
            ``'mallet'`` writes ``feat feat ... label``.

    Returns:
        None. If the output file cannot be opened, a message is printed
        and the function returns without reading the input.

    Raises:
        ValueError: If ``fmt`` is not one of the supported formats. This is
            checked before the output file is opened, so a bad argument
            never truncates an existing file.
    '''
    if fmt not in ('crfsuite', 'mallet'):
        raise ValueError('unsupported fmt: %r' % (fmt,))

    try:
        output = open(crff_path, 'w')
    except IOError:
        print('cannot open crff file')
        return

    try:
        # NOTE(review): is_sent is a module-level name defined outside this
        # block; presumably it tells doc_file how to delimit sentences.
        df = doc_file(bio_path, is_sent)
        try:
            for s in df:
                fields_per_line = [t.split() for t in s.strip().split('\n')]
                # Blank lines split to [] and would fail on token[-1];
                # they carry no token, so drop them.
                sent = [t for t in fields_per_line if t]
                if not sent:
                    continue
                if not labeled:
                    for t in sent:
                        t.append('empty')
                for token in sent:
                    output.write(
                        _format_crff_line(token, sent, feats, fmt) + '\n')
                # A blank line terminates each sentence.
                output.write('\n')
        finally:
            df.close()
    finally:
        # Close the output even when reading or a feature function raises.
        output.close()


def _format_crff_line(token, sent, feats, fmt):
    '''Render one token as a feature line in the requested format.'''
    label = token[-1]
    rendered = []
    for f in feats:
        # Evaluate each feature once and reuse the value for both the
        # truthiness filter and the rendered text.
        value = f(token, sent)
        if value:
            rendered.append(f.__name__ + '=' + str(value))
    if fmt == 'crfsuite':
        return label + '\t' + '\t'.join(rendered)
    # 'mallet': features first, label last, space separated.
    return ' '.join(rendered) + ' ' + label