import json

from libshorttext.classifier import TextModel, predict_single_text

mod = TextModel('models/expends2012.description.sorted-labeled.csv.svm.model')
data = json.load(open('/home/blannon/og_data/expenditures/expends12-unlabeled-csv.json'))

with open('results/expends2012.unlabeled.full.results.csv', 'w') as out:
    for d in data['rows']:
        # json.load returns unicode; drop non-ASCII characters up front
        # instead of letting str() raise UnicodeEncodeError on accented text
        descrip = (d['descrip'] or u'').encode('ascii', 'ignore')
        r = predict_single_text(descrip, mod)
        print d['ID'], descrip, r.predicted_y
        out.write('\t'.join([str(a) for a in [d['ID'], r.predicted_y]]))
        out.write('\n')
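# For reference: predict_single_text returns a result object whose
# predicted_y attribute is the predicted label and whose decvals attribute
# holds the per-class SVM decision values (both attributes are used in the
# scripts below). A minimal inspection sketch; the model path and input
# string here are hypothetical.
from libshorttext.classifier import TextModel, predict_single_text

mod = TextModel('models/example.svm.model')  # hypothetical path
r = predict_single_text('office supplies and postage', mod)
print(r.predicted_y)  # the winning label
print(r.decvals)      # one decision value per class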
import csv

from libshorttext.classifier import TextModel, predict_single_text

unlabeled_data_filename = 'data/unlabeled/descrip.unlabeled.csv'
model_filename = 'models/descrip.labeled.csv.svm.model'

mod = TextModel(model_filename)

# sniff the delimiter/quoting from the first line of the input file;
# readline() (rather than readline(100)) avoids sniffing a truncated sample
_dialect = csv.Sniffer().sniff(open(unlabeled_data_filename).readline())
_dialect.escapechar = '\\'

data = csv.DictReader(open(unlabeled_data_filename), dialect=_dialect)
out = csv.DictWriter(open('results/descrip.unlabeled.results.csv', 'w'),
                     fieldnames=data.fieldnames + ['guess'],
                     dialect=_dialect)
out.writeheader()

for d in data:
    # normalize to ASCII; libshorttext expects plain byte strings
    descrip = (d['descrip'] or '').decode('utf-8').encode('ascii', 'ignore')
    r = predict_single_text(descrip, mod)
    d['descrip'] = descrip
    d['guess'] = r.predicted_y
    out.writerow(d)
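# csv.Sniffer().sniff raises csv.Error when it cannot infer a dialect from
# the sample. A defensive variant of the detection step above that falls
# back to tab-separated values; this helper is hypothetical, not part of
# the original script.
import csv

class TabDialect(csv.excel_tab):
    escapechar = '\\'

def detect_dialect(path):
    with open(path) as f:
        sample = f.readline()
    try:
        dialect = csv.Sniffer().sniff(sample)
        dialect.escapechar = '\\'
        return dialect
    except csv.Error:
        return TabDialect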
from libshorttext.classifier import TextModel, predict_single_text

# input is tab-separated, one record per line, with the description in the
# second column
data = [line.strip().split('\t')
        for line in open('data/unlabeled/expends2012.description.unlabeled.csv')]
mod = TextModel('models/expends2012.description.sorted-labeled.csv.svm.model')

with open('results/expends2012.description.unlabeled.results.csv', 'w') as out:
    for d in data:
        r = predict_single_text(d[1], mod)
        out.write('\t'.join([r.predicted_y, d[1]]) + '\n')
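# The .svm.model files loaded above are produced ahead of time from the
# labeled CSVs. A hedged training sketch, assuming libshorttext's train_text
# helper and a hypothetical labeled file of "label<TAB>text" lines; check
# your libshorttext version for the exact return values.
from libshorttext.classifier import train_text

m, svm_file = train_text('data/labeled/descrip.labeled.csv')
m.save('models/descrip.labeled.csv.svm.model')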
import csv
import json
import logging
import os

from libshorttext import classifier

import settings                      # project-local configuration (assumed)
from asciiDammit import asciiDammit  # ASCII-normalization helper (assumed import path)

logger = logging.getLogger(__name__)

logger.info('initiating textmodel')
svm_model = classifier.TextModel('../svm_experts/models/fcc-experts.model')

logger.info('listing documents')
flocs = [line.strip()
         for line in open(os.path.join(settings.PERSIST_DIR, 'document_index'), 'r')]
logger.info('... found {} documents'.format(len(flocs)))

def get_json(filename):
    return json.load(open(os.path.join(settings.RAW_DIR, filename)))

def get_text(jd):
    # fall back to the empty string when a document carries no 'text' field
    txt = jd.get('text', "")
    if txt:
        return asciiDammit(txt)
    else:
        return ""

with open(os.path.join(settings.PERSIST_DIR, 'expert_predictions.csv'), 'w') as fout:
    writer = csv.writer(fout)
    for i, floc in enumerate(flocs):
        fname = os.path.basename(floc)
        if not i % 1000:
            logger.info('predicted {} documents'.format(i))
        result = classifier.predict_single_text(get_text(get_json(fname)), svm_model)
        # write the predicted label and both per-class decision values
        writer.writerow((fname, result.predicted_y, result.decvals[0], result.decvals[1]))
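# If the asciiDammit module is unavailable, a rough standard-library
# stand-in for the same "force to ASCII" normalization (it will not match
# asciiDammit's smart-quote handling exactly):
import unicodedata

def force_ascii(text):
    # decompose accented characters, then drop anything outside ASCII
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')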
# Part two: the same pipeline as above, run over a second document index.
# The imports, logger, and model setup were truncated in the original
# fragment and are restored here to mirror the part-one script.
import csv
import json
import logging
import os

from libshorttext import classifier

import settings                      # project-local configuration (assumed)
from asciiDammit import asciiDammit  # assumed import path, as in part one

logger = logging.getLogger(__name__)

# assumed to match part one; the original fragment begins after this point
svm_model = classifier.TextModel('../svm_experts/models/fcc-experts.model')

doc_ids = [
    line.strip() for line in open(
        os.path.join(settings.PERSIST_DIR, 'document_index_part_two'), 'r')
]
logger.info('... found {} documents'.format(len(doc_ids)))

def get_json(filename):
    return json.load(open(os.path.join(settings.PROC_DIR, filename)))

def get_text(jd):
    txt = jd.get('text', "")
    if txt:
        return asciiDammit(txt)
    else:
        return ""

with open(
        os.path.join(settings.PERSIST_DIR, 'expert_predictions_part_two.csv'),
        'w') as fout:
    writer = csv.writer(fout)
    for i, doc_id in enumerate(doc_ids):
        fname = '{}.json'.format(doc_id)
        if not i % 1000:
            logger.info('predicted {} documents'.format(i))
        result = classifier.predict_single_text(get_text(get_json(fname)), svm_model)
        writer.writerow(
            (fname, result.predicted_y, result.decvals[0], result.decvals[1]))
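# Post-processing sketch: turn the two decision-value columns written above
# into a rough confidence margin and surface the least certain predictions
# for manual review. The margin heuristic is an addition here, not part of
# libshorttext.
import csv
import os

import settings  # project-local configuration, as above

with open(os.path.join(settings.PERSIST_DIR, 'expert_predictions_part_two.csv')) as f:
    rows = [(abs(float(dv0) - float(dv1)), fname, label)
            for fname, label, dv0, dv1 in csv.reader(f)]

# smallest margins first: the predictions the model was least sure about
for margin, fname, label in sorted(rows)[:20]:
    print('{:.4f}\t{}\t{}'.format(margin, label, fname))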