def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += weight
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += weight
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            # CC-By edges belong in both outputs; CC-By-SA edges only in the SA output.
            writer_core.write(edge)
            writer_sa.write(edge)
        else:
            writer_sa.write(edge)
    writer_core.close()
    writer_sa.close()
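# A minimal usage sketch for sum_assertions above (not part of the original
# module): it takes an index into the numbered temp files at
# CURRENT_DIR + '/data/temp/core_<index>.txt'. The file count below is a
# hypothetical placeholder; the real build pipeline decides how many exist.
if __name__ == '__main__':
    NUM_TEMP_FILES = 10  # assumption for illustration only
    for index in range(NUM_TEMP_FILES):
        sum_assertions(index)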
def build_core_from_csvs(csv_files):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += weight
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
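# A minimal usage sketch for build_core_from_csvs (assumed, not part of the
# original source): gather the flat assertion CSVs and hand them to the
# builder. The 'data/flat/*.csv' pattern is a hypothetical location.
import glob

if __name__ == '__main__':
    csv_files = sorted(glob.glob('data/flat/*.csv'))
    build_core_from_csvs(csv_files)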
    # (inside the loop over Verbosity input lines)
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    if make_json:
        left = make_concept_uri(unicode(left), 'en')
        right = make_concept_uri(unicode(right), 'en')
        edge = make_edge(rel, left, right, '/d/verbosity', '/l/CC/By',
                         sources, surfaceText=text, weight=score / 10.0)
        writer.write(edge)

# (after the loop: close all output files)
if make_json:
    writer.close()
flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += weight
        # Store the running total, so the last update for a URI holds its
        # summed weight rather than only the weight of the final line.
        assertions[uri] = (rel, start, end, context, weights[uri])
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')
for uri, values in assertions.iteritems():
    relation, start, end, context, weight = values
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    edge = make_edge(relation, start, end, dataset, license,
                     ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()
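# A small worked example (hypothetical URIs and weights) of the summing pass
# above: repeated URIs accumulate weight in the defaultdict, and the stored
# tuple keeps the running total, so its final value is the summed weight.
from collections import defaultdict

example_weights = defaultdict(float)
example_assertions = {}
for example_uri, w in [('/a/example', 1.0), ('/a/example', 0.5), ('/a/other', 2.0)]:
    example_weights[example_uri] += w
    example_assertions[example_uri] = ('/r/RelatedTo', '/c/en/x', '/c/en/y',
                                       '/ctx/all', example_weights[example_uri])

print example_assertions['/a/example'][-1]   # 1.5: both weights were summed
print example_assertions['/a/other'][-1]     # 2.0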
    # (inside the loop over Verbosity input lines)
    score = (freq * 2 - 1) * (1000 - orderscore) * (1 - sls) / 1000
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    if make_json:
        edge = make_edge(rel, left, right, '/d/verbosity', '/l/CC/By',
                         sources, surfaceText=text, weight=score / 10.0)
        writer.write(edge)

# (after the loop: close all output files)
if make_json:
    writer.close()
flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
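# A hedged sketch of the Verbosity scoring rule used above, factored out so
# the arithmetic is easy to check. The formula is copied from the reader code;
# the interpretation of freq, orderscore, and sls (vote count, answer
# position, and string similarity) is an assumption based on the names.
def verbosity_score(freq, orderscore, sls):
    return (freq * 2 - 1) * (1000 - orderscore) * (1 - sls) / 1000.0

# Example: verbosity_score(5, 100, 0.2)
#   = (5*2 - 1) * (1000 - 100) * (1 - 0.2) / 1000
#   = 9 * 900 * 0.8 / 1000 = 6.48,
# which the reader would store as an edge weight of 6.48 / 10.0 = 0.648.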