def calculate(type_name, lang):
    """Count how often each property occurs among entities of the given type.

    Returns a dict mapping property -> number of entities carrying it;
    '__total__' holds the number of entities of the type."""
    result = {}
    indexs = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    type_table_names = ["type_en", "type_es", "type_ru"]
    property_table_names = ["property_en", "property_es", "property_ru"]
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    rows = []
    property_table_name = ''
    type_table_name = ''
    if lang != 'FA':
        index = indexs[lang]
        type_table_name = type_table_names[index]
        property_table_name = property_table_names[index]
        type_name = '<' + type_name + '>'
        query = "select entity from __type_table__ where type = '__type_name__'"
        query = query.replace('__type_table__', type_table_name)
        query = query.replace('__type_name__', type_name)
        rows = mydb.executeQueryResult(con, query, True)
        result['__total__'] = len(rows)
    else:
        property_table_name = 'property_fa'
        query = "select distinct entity from property_fa where property = '<http://fa.dbpedia.org/property/type>' and value = '__type_name__'"
        query = query.replace('__type_name__', type_name)
        rows = mydb.executeQueryResult(con, query, True)
        result['__total__'] = len(rows)
    temp_result = {}
    print len(rows)
    i = 0
    for row in rows:
        entity = row[0]
        entity = entity.replace("'", "''")
        query = "select property from __property_table__ where entity = '__entity_name__';"
        query = query.replace('__property_table__', property_table_name)
        query = query.replace('__entity_name__', entity)
        ps = mydb.executeQueryResult(con, query, False)
        # collect the distinct properties of this entity, then add them to the totals
        for p in ps:
            if p[0] not in temp_result:
                temp_result[p[0]] = 1
        for p in temp_result:
            if p not in result:
                result[p] = 0
            result[p] += 1
        temp_result = {}
        i += 1
        if i % 50 == 0:
            print i
    #print result
    return result
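# Hypothetical usage sketch (not part of the original module): run calculate()
# for one DBpedia type and print the most frequent properties. The type IRI and
# the top-20 cut-off below are example values only.
if __name__ == '__main__':
    counts = calculate('http://dbpedia.org/ontology/Film', 'EN')
    total = counts.pop('__total__')
    print 'entities of type:', total
    for prop, freq in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:20]:
        print prop, freq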
def main():
    """Dump reason/consequence pairs, tuples and coref-merged pairs from the rc table into evaluation files."""
    offset = 0
    # change your output dir
    dir = 'evaluate'
    pfile = open(dir + '/evaluate.pairs.' + str(offset) + '.txt', 'w')
    tfile = open(dir + '/evaluate.tuples.' + str(offset) + '.txt', 'w')
    cfile = open(dir + '/evaluate.corefs.' + str(offset) + '.txt', 'w')
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    # change the amount of results, currently 200
    query = 'select * from rc order by id limit 200 offset ' + str(offset)
    rows = mydb.executeQueryResult(con, query, False)
    for row in rows:
        id = int(row[0])
        pairs = json.loads(row[1])
        tuples = json.loads(row[2])
        coref = None
        if row[3]:
            coref = json.loads(row[3])
        # write the pairs
        for pair in pairs:
            reasons = pair[0]
            consequences = pair[1]
            for reason in reasons:
                pfile.write(str(id) + '\t' + reason[0] + '\n')
            for consequence in consequences:
                pfile.write(str(id) + '\t' + consequence[0] + '\n')
        # write the tuples
        for t in tuples:
            reasons = t[0]
            consequences = t[1]
            for reason in reasons:
                tfile.write(str(id) + '\t' + tuple2str(reason[0]) + '\n')
            for consequence in consequences:
                tfile.write(str(id) + '\t' + tuple2str(consequence[0]) + '\n')
        # write the coreference-resolved pairs
        if row[3]:
            corefPairs = rcreplace.mergeCoref(pairs, coref)
            for pair in corefPairs:
                reasons = pair[0]
                consequences = pair[1]
                for reason in reasons:
                    cfile.write(str(id) + '\t' + reason + '\n')
                for consequence in consequences:
                    cfile.write(str(id) + '\t' + consequence + '\n')
    pfile.close()
    tfile.close()
    cfile.close()
def loadAll(lang):
    """Load the ConceptNet edges for `lang` into the global `data` adjacency dict
    (concept -> relation -> list of neighbouring concepts)."""
    global loaded
    if loaded:
        return
    # prepare db
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'cn_' + lang
    print 'load from database...'
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)
    for r in records:
        rel = r[0]
        start = None
        end = None
        pos = ''
        ll = mysplit(r[1].decode('utf8'), '/')
        if len(ll) >= 3:
            start = ll[2]
        ll = mysplit(r[2].decode('utf8'), '/')
        if len(ll) >= 3:
            end = ll[2]
        if len(ll) >= 4:
            pos = ll[3]
        rel = rel + pos
        # add start to end's neighbours
        if end not in data:
            data[end] = {}
        if rel not in data[end]:
            data[end][rel] = []
        if start not in data[end][rel]:
            data[end][rel].append(start)
        # add end to start's neighbours
        if start not in data:
            data[start] = {}
        if rel not in data[start]:
            data[start][rel] = []
        if end not in data[start][rel]:
            data[start][rel].append(end)
    print 'loading done!'
    loaded = True
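# Minimal sketch of querying the in-memory graph once loadAll() has run; it
# assumes the module-level `data` dict used above. The concept 'dog' is an
# illustrative key only.
loadAll('en')
for rel, concepts in data.get('dog', {}).items():
    print rel, len(concepts)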
def main():
    # file to save
    #jfile = open('result.tuple.json.txt','w')
    file_path = os.path.join(settings.PROJECT_DIR, 'result/raw/result.sentence.json.txt')
    jsfile = open(file_path, 'w')
    #file = open('result.txt','w')
    #jhfile = open('result.jh.txt','w')
    # db
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    query = 'select id, review_clauses from review where review_clauses is not null order by id'
    rows = mydb.executeQueryResult(con, query, True)
    for row in rows:
        id = row[0]
        review = row[1]
        if not review:
            continue
        review = review.decode('utf-8')
        clauses = review.split('###')
        tpairs = processReview(clauses)
        if len(tpairs) == 0:
            continue
        #jfile.write(json.dumps({'id':id,'pairs':pairs})+'\n')
        jsfile.write(json.dumps({'id': id, 'sen_pairs': tpairs}) + '\n')
        #file.write('id:'+str(id)+'\n')
        #jhfile.write('id:'+str(id)+'\n')
        # for tpair in tpairs:
        #     file.write('Reasons:'+repr(tpair[0])+'\n')
        #     file.write('Consequences:'+ repr(tpair[1])+'\n')
        # for pair in pairs:
        #     jhfile.write('Reasons:\n')
        #     jhfile.write(repr(pair[0][0][0]['subj'])+' ')
        #     jhfile.write(repr(pair[0][0][0]['verb'])+' ')
        #     jhfile.write(repr(pair[0][0][0]['dobj'])+' ')
        #     jhfile.write(repr(pair[0][0][0]['iobj'])+'\n')
        #     jhfile.write('Consequences:\n')
        #     jhfile.write(repr(pair[1][0][0]['subj'])+' ')
        #     jhfile.write(repr(pair[1][0][0]['verb'])+' ')
        #     jhfile.write(repr(pair[1][0][0]['dobj'])+' ')
        #     jhfile.write(repr(pair[1][0][0]['iobj'])+'\n')
    jsfile.close()
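# Small reader sketch (an assumption, not original code) for the JSON-lines file
# written by main(): each line holds one review id with its sentence-level
# reason/consequence pairs.
def readResults(path):
    results = []
    with open(path) as f:
        for line in f:
            results.append(json.loads(line))
    return results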
def loadAll(lang):
    """Load the translation_<lang> table into the global `data` dict
    (word in `lang` -> English ConceptNet concept)."""
    global loaded
    if loaded:
        return
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'translation_' + lang
    print table_name
    print 'loading translation from db...'
    sa.loadAll()
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)
    for r in records:
        ll = mysplit(r[1].decode('utf8'), '/')
        word = ll[-1]
        ll = mysplit(r[2].decode('utf8'), '/')
        if len(ll) < 3:
            continue
        en_word = ll[2]
        if word not in data:
            data[word] = en_word
    print 'loading done!'
    loaded = True
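# Hedged lookup example: after loadAll(), `data` maps a word in the chosen
# language to its English ConceptNet concept. The language code 'es' and the
# word u'perro' are placeholders, not values from the original project.
loadAll('es')
print data.get(u'perro', 'not found')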
#!/usr/bin/python
# -*- coding: utf-8 -*-
import mydb

CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)
query = "select distinct value from property_fa where property = '<http://fa.dbpedia.org/property/type>'"
rows = mydb.executeQueryResult(con, query, True)
file = open('types_fa.txt', 'w')
print len(rows)
for row in rows:
    file.write(row[0] + '\n')
file.close()
def processSingle(start, end, fileName, dirName, p):
    """Run SPADE over reviews [start, end) one at a time and store the
    resulting clauses back into review.review_clauses."""
    fileName = os.path.abspath(fileName)
    #totalOneTime = 1000  # increment each time
    totalOneTime = end
    con = mydb.getCon(CONN_STRING)
    #nltk.download()
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    #text = nltk.corpus.abc.raw('science.txt')
    iter = start
    per = 1
    while iter * per < totalOneTime:
        print iter
        query = "select id, review_text from review order by id LIMIT " + str(per) + " OFFSET " + str(iter * per)
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = list()
        #fileName = 'sents.txt'
        file = open(fileName, 'w')
        entry = resultSet[0]
        try:
            sents = sent_tokenizer.tokenize(entry[1])
        except UnicodeDecodeError, e:
            iter += 1
            continue
        #sentLens.append([entry[0], len(sents)])
        # fileName = 'sents' + str(iter * 10) + '-' + str(iter * 10 + 10) + '.txt'
        for sent in sents:
            if p.match(sent):
                print sent
                sent = 'Special'
            elif len(sent.split()) > 70:
                print sent
                sent = 'longsentence'
            file.write('<s> ' + sent + ' </s> \n')
        file.close()
        os.system('perl ' + dirName + '/spade.pl ' + fileName)
        outputFileName = fileName + '.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()
        loc = 0
        #print len(content)
        clauses = list()
        # parse the SPADE output: a count line followed by that many clause lines
        while loc < len(content):
            subLen = int(content[loc])
            loc += 1
            j = 0
            while j < subLen:
                j += 1
                if len(content[loc].split()) > 2:
                    clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                loc += 1
            #print subLen, j, loc
        if len(clauses) < 1:
            iter += 1
            continue
        strClauses = clauses[0]
        for clause in clauses[1:]:
            strClauses += '###' + clause
        query = "UPDATE review SET (review_clauses) = ('" + strClauses + "') WHERE id = '" + str(entry[0]) + "'"
        mydb.executeQuery(con, query, False)
        sentLens = list()
        iter += 1
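# A hedged driver sketch for processSingle(): the regular expression, the review
# range and the SPADE directory below are placeholders, not values taken from
# the original project.
if __name__ == '__main__':
    import re
    special = re.compile(r'^[^A-Za-z]*$')  # e.g. flag sentences without letters as 'Special'
    processSingle(0, 1000, 'sents.single.txt', '/path/to/spade', special)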
def processBatch(p):
    """Batch variant of the SPADE clause segmentation: tokenize `per` reviews
    per iteration and update review.review_clauses."""
    totalOneTime = 300  # increment each time
    con = mydb.getCon(CONN_STRING)
    #nltk.download()
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    #text = nltk.corpus.abc.raw('science.txt')
    iter = 237
    per = 1
    while iter * per < totalOneTime:
        query = "select id, review_text from review order by id LIMIT " + str(per) + " OFFSET " + str(iter * per)
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = list()
        fileName = 'sents.txt'
        file = open(fileName, 'w')
        for entry in resultSet:
            sents = sent_tokenizer.tokenize(entry[1])
            sentLens.append([entry[0], len(sents)])
            # fileName = 'sents' + str(iter * 10) + '-' + str(iter * 10 + 10) + '.txt'
            for sent in sents:
                if p.match(sent):
                    print sent
                    sent = 'Special'
                elif len(sent.split()) > 70:
                    print sent
                    sent = 'longsentence'
                file.write('<s> ' + sent + ' </s> \n')
        file.close()
        print sentLens
        os.system('perl spade.pl ' + fileName)
        outputFileName = 'sents.txt.chp.edu.fmt'
        #outputFile = open(outputFileName, 'r')
        with open(outputFileName) as f:
            content = f.readlines()
        loc = 0
        queries = list()
        print len(content)
        for lens in sentLens:
            i = 0
            clauses = list()
            while i < lens[1]:
                i += 1
                #print lens[0], content[loc]
                subLen = int(content[loc])
                loc += 1
                j = 0
                print subLen
                while j < subLen:
                    print j
                    j += 1
                    print content[loc],
                    if len(content[loc].split()) > 2:
                        clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                    loc += 1
                print subLen, j, loc
            #print clauses
            strClauses = clauses[0]
            for clause in clauses[1:]:
                strClauses += '###' + clause
            query = "UPDATE review SET (review_clauses) = ('" + strClauses + "') WHERE id = '" + str(lens[0]) + "'"
            #print query
            mydb.executeQuery(con, query, False)
            #queries.append(query)
        #print queries
        #mydb.executeManyQuery(con, queries, False)
        sentLens = list()
        iter += 1
    #sents = sent_tokenizer.tokenize(text)
    #pprint(sents[1:2])
    con.close()