def insertReviews(fileName):
    con = mydb.getCon(CONN_STRING)
    json_data = open(fileName, 'r')  # one JSON object per line
    total = 0
    fail = 0
    succ = 0
    for entry in json_data:
        total += 1
        data = json.loads(entry)
        succ += 1
        # Build the insert by hand; escape single quotes in every text field
        # (not just the review body) so the statement stays well formed.
        query = "insert into review(id, business_id, user_id, review_text, review_date) values('"
        query += str(succ) + "', '"
        query += data['business_id'].replace("'", "''") + "', '"
        query += data['user_id'].replace("'", "''") + "', '"
        query += data['text'].replace("'", "''") + "', '"
        query += data['date'].replace("'", "''") + "')"
        try:
            mydb.executeQuery(con, query, False)
        except psycopg2.DatabaseError as e:
            fail += 1
            succ -= 1
            print 'Error %s' % e
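# The mydb helper module is used throughout these snippets but is not
# included in them. Below is a minimal sketch of what its interface could
# look like on top of psycopg2, reconstructed only from how the helpers are
# called; it is an assumption, not the project's actual implementation.
import psycopg2

def get_CONN():
    # placeholder connection string; real credentials are not in the source
    return "host='localhost' dbname='yelp' user='******' password='******'"

def getCon(conn_string):
    return psycopg2.connect(conn_string)

def executeQuery(con, query, debug):
    # run a single statement and commit
    if debug:
        print query
    cur = con.cursor()
    cur.execute(query)
    con.commit()

def executeQueryResult(con, query, debug):
    # run a select and return all rows
    if debug:
        print query
    cur = con.cursor()
    cur.execute(query)
    return cur.fetchall()

def executeManyQuery(con, queries, debug):
    # run a list of statements in one transaction
    cur = con.cursor()
    for q in queries:
        if debug:
            print q
        cur.execute(q)
    con.commit()

def closeCon(con):
    con.close()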
def createTables(CONN_STRING):
    con = mydb.getCon(CONN_STRING)
    queries = list()
    queries.append('create table users(user_id varchar(50), name varchar(100), '
                   'review_count int, primary key(user_id))')
    queries.append('create table business(business_id varchar(50), name varchar(100), '
                   'full_address varchar(100), city varchar(20), state varchar(20), '
                   'review_count int, categories varchar(20), primary key(business_id))')
    queries.append('create table review(id int, business_id varchar(50), user_id varchar(50), '
                   'review_text text, review_date date, review_clauses text, primary key(id))')
    mydb.executeManyQuery(con, queries, True)
def dropTables(CONN_STRING):
    con = mydb.getCon(CONN_STRING)
    queries = list()
    queries.append('drop table if exists review')
    queries.append('drop table if exists users')
    queries.append('drop table if exists business')
    mydb.executeManyQuery(con, queries, True)
def clearTables(CONN_STRING):
    con = mydb.getCon(CONN_STRING)
    queries = list()
    queries.append('delete from review')
    queries.append('delete from users')
    queries.append('delete from business')
    mydb.executeManyQuery(con, queries, True)
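# Hypothetical driver tying the helpers above together: rebuild the schema,
# then load a Yelp-style review dump. The dump file name is an assumption,
# not part of the original source.
if __name__ == '__main__':
    CONN_STRING = mydb.get_CONN()
    dropTables(CONN_STRING)
    createTables(CONN_STRING)
    insertReviews('yelp_academic_dataset_review.json')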
def main():
    offset = 0
    # change your output dir
    dir = 'evaluate'
    pfile = open(dir + '/evaluate.pairs.' + str(offset) + '.txt', 'w')
    tfile = open(dir + '/evaluate.tuples.' + str(offset) + '.txt', 'w')
    cfile = open(dir + '/evaluate.corefs.' + str(offset) + '.txt', 'w')
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    # change the result limit here; currently 200
    query = 'select * from rc order by id limit 200 offset 0'
    rows = mydb.executeQueryResult(con, query, False)
    for row in rows:
        id = int(row[0])
        pairs = json.loads(row[1])
        tuples = json.loads(row[2])
        coref = None
        if row[3]:
            coref = json.loads(row[3])
        # write the pairs
        for pair in pairs:
            reasons = pair[0]
            consequences = pair[1]
            for reason in reasons:
                pfile.write(str(id) + '\t' + reason[0] + '\n')
            for consequence in consequences:
                pfile.write(str(id) + '\t' + consequence[0] + '\n')
        # write the tuples
        for t in tuples:
            reasons = t[0]
            consequences = t[1]
            for reason in reasons:
                tfile.write(str(id) + '\t' + tuple2str(reason[0]) + '\n')
            for consequence in consequences:
                tfile.write(str(id) + '\t' + tuple2str(consequence[0]) + '\n')
        # write the coref-merged pairs
        if row[3]:
            corefPairs = rcreplace.mergeCoref(pairs, coref)
            for pair in corefPairs:
                reasons = pair[0]
                consequences = pair[1]
                for reason in reasons:
                    cfile.write(str(id) + '\t' + reason + '\n')
                for consequence in consequences:
                    cfile.write(str(id) + '\t' + consequence + '\n')
    pfile.close()
    tfile.close()
    cfile.close()
def calculate(type_name, lang):
    result = {}
    indexes = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    type_table_names = ["type_en", "type_es", "type_ru"]
    property_table_names = ["property_en", "property_es", "property_ru"]
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    rows = []
    property_table_name = ''
    type_table_name = ''
    if lang != 'FA':
        index = indexes[lang]
        type_table_name = type_table_names[index]
        property_table_name = property_table_names[index]
        type_name = '<' + type_name + '>'
        query = "select entity from __type_table__ where type = '__type_name__'"
        query = query.replace('__type_table__', type_table_name)
        query = query.replace('__type_name__', type_name)
        rows = mydb.executeQueryResult(con, query, True)
        result['__total__'] = len(rows)
    else:
        # Persian types live in the property table, not a separate type table
        property_table_name = 'property_fa'
        query = ("select distinct entity from property_fa where property = "
                 "'<http://fa.dbpedia.org/property/type>' and value = '__type_name__'")
        query = query.replace('__type_name__', type_name)
        rows = mydb.executeQueryResult(con, query, True)
        result['__total__'] = len(rows)
    temp_result = {}
    print len(rows)
    i = 0
    for row in rows:
        entity = row[0]
        entity = entity.replace("'", "''")
        query = "select property from __property_table__ where entity = '__entity_name__';"
        query = query.replace('__property_table__', property_table_name)
        query = query.replace('__entity_name__', entity)
        ps = mydb.executeQueryResult(con, query, False)
        # count each property at most once per entity
        for p in ps:
            if p[0] not in temp_result:
                temp_result[p[0]] = 1
        for p in temp_result:
            if p not in result:
                result[p] = 0
            result[p] += 1
        temp_result = {}
        i += 1
        if i % 50 == 0:
            print i
    return result
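# Hypothetical usage of calculate(): rank the properties of a DBpedia type by
# how many of its entities carry them. The type URI is an example and not
# taken from the original source.
if __name__ == '__main__':
    stats = calculate('http://dbpedia.org/ontology/City', 'EN')
    total = stats.pop('__total__')
    if total:
        for prop, count in sorted(stats.items(), key=lambda kv: -kv[1])[:20]:
            print prop, '%.1f%%' % (100.0 * count / total)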
def loadAll(lang):
    global loaded
    if loaded is True:
        return
    # prepare db
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'cn_' + lang
    print 'load from database...'
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)
    for r in records:
        rel = r[0]
        start = None
        end = None
        pos = ''
        ll = mysplit(r[1].decode('utf8'), '/')
        if len(ll) >= 3:
            start = ll[2]
        ll = mysplit(r[2].decode('utf8'), '/')
        if len(ll) >= 3:
            end = ll[2]
        if len(ll) >= 4:
            pos = ll[3]
        rel = rel + pos
        # add start to end's neighbours
        if end not in data:
            data[end] = {}
        if rel not in data[end]:
            data[end][rel] = []
        if start not in data[end][rel]:
            data[end][rel].append(start)
        # add end to start's neighbours
        if start not in data:
            data[start] = {}
        if rel not in data[start]:
            data[start][rel] = []
        if end not in data[start][rel]:
            data[start][rel].append(end)
    print 'loading done!'
    loaded = True
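# Hypothetical lookup against the module-level adjacency dict that loadAll()
# fills: list a few neighbours of a concept per relation. 'coffee' is an
# arbitrary example concept.
loadAll('en')
for rel, neighbours in data.get('coffee', {}).items():
    print rel, neighbours[:5]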
def processTranslation(lang, dirPath):
    # prepare DB
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'translation_' + lang
    create_table(con, table_name)
    # process and insert the wiktionary dump
    sum = 0
    dirPath = os.path.abspath(dirPath)
    file = 'wiktionary.csv'
    print file
    sum += insertTranslation(con, table_name, os.path.join(dirPath, file), lang)
    print sum
def main():
    # file to save sentence-level pairs
    file_path = os.path.join(settings.PROJECT_DIR, 'result/raw/result.sentence.json.txt')
    jsfile = open(file_path, 'w')
    # db
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    query = 'select id, review_clauses from review where review_clauses is not null order by id'
    rows = mydb.executeQueryResult(con, query, True)
    for row in rows:
        id = row[0]
        review = row[1]
        if not review:
            continue
        review = review.decode('utf-8')
        # clauses were stored '###'-separated by the SPADE pass
        clauses = review.split('###')
        tpairs = processReview(clauses)
        if len(tpairs) == 0:
            continue
        jsfile.write(json.dumps({'id': id, 'sen_pairs': tpairs}) + '\n')
    jsfile.close()
def processConceptNet(lang, dirPath):
    # prepare DB
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'cn_' + lang
    create_table(con, table_name)
    # process and insert every csv dump, skipping the zh and nadya files
    sum = 0
    dirPath = os.path.abspath(dirPath)
    for file in os.listdir(dirPath):
        if file.endswith('.csv'):
            if '_zh_' in file or '_nadya_' in file:
                continue
            print file
            sum += insert_records(con, table_name, os.path.join(dirPath, file), lang)
    print sum
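# Hypothetical invocation: load every English ConceptNet csv dump from a
# local directory. The path is an assumption.
if __name__ == '__main__':
    processConceptNet('en', 'data/conceptnet5/csv')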
def loadAll(lang):
    global loaded
    if loaded:
        return
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'translation_' + lang
    print table_name
    print 'loading translation from db...'
    sa.loadAll()
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)
    for r in records:
        ll = mysplit(r[1].decode('utf8'), '/')
        word = ll[-1]
        ll = mysplit(r[2].decode('utf8'), '/')
        if len(ll) < 3:
            continue
        en_word = ll[2]
        # keep the first translation seen for each word
        if word not in data:
            data[word] = en_word
    print 'loading done!'
    loaded = True
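# Hypothetical lookup after loading the Spanish translation table (this
# loadAll() lives in a different module than the ConceptNet one above):
# map a native word to its English concept. The word is an arbitrary example.
loadAll('es')
print data.get(u'gato')  # expected: the English concept linked via wiktionary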
def processBatch(p):
    totalOneTime = 300  # increment each time
    con = mydb.getCon(CONN_STRING)
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    iter = 237
    per = 1
    while iter * per < totalOneTime:
        query = ("select id, review_text from review order by id LIMIT "
                 + str(per) + " OFFSET " + str(iter * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = list()
        fileName = 'sents.txt'
        file = open(fileName, 'w')
        for entry in resultSet:
            sents = sent_tokenizer.tokenize(entry[1])
            sentLens.append([entry[0], len(sents)])
            for sent in sents:
                # replace sentences SPADE cannot handle
                if p.match(sent):
                    print sent
                    sent = 'Special'
                elif len(sent.split()) > 70:
                    print sent
                    sent = 'longsentence'
                file.write('<s> ' + sent + ' </s> \n')
        file.close()
        print sentLens
        # run the SPADE discourse parser over the sentence file
        os.system('perl spade.pl ' + fileName)
        outputFileName = 'sents.txt.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()
        loc = 0
        print len(content)
        for lens in sentLens:
            i = 0
            clauses = list()
            while i < lens[1]:
                i += 1
                subLen = int(content[loc])  # number of clause lines for this sentence
                loc += 1
                j = 0
                print subLen
                while j < subLen:
                    print j
                    j += 1
                    print content[loc],
                    if len(content[loc].split()) > 2:
                        clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                    loc += 1
                print subLen, j, loc
            if len(clauses) < 1:
                continue
            strClauses = clauses[0]
            for clause in clauses[1:]:
                strClauses += '###' + clause
            query = ("UPDATE review SET (review_clauses) = ('" + strClauses
                     + "') WHERE id = '" + str(lens[0]) + "'")
            mydb.executeQuery(con, query, False)
        iter += 1
    con.close()
def processSingle(start, end, fileName, dirName, p):
    fileName = os.path.abspath(fileName)
    totalOneTime = end
    con = mydb.getCon(CONN_STRING)
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    iter = start
    per = 1
    while iter * per < totalOneTime:
        print iter
        query = ("select id, review_text from review order by id LIMIT "
                 + str(per) + " OFFSET " + str(iter * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        file = open(fileName, 'w')
        entry = resultSet[0]
        try:
            sents = sent_tokenizer.tokenize(entry[1])
        except UnicodeDecodeError as e:
            iter += 1
            continue
        for sent in sents:
            # replace sentences SPADE cannot handle
            if p.match(sent):
                print sent
                sent = 'Special'
            elif len(sent.split()) > 70:
                print sent
                sent = 'longsentence'
            file.write('<s> ' + sent + ' </s> \n')
        file.close()
        os.system('perl ' + dirName + '/spade.pl ' + fileName)
        outputFileName = fileName + '.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()
        loc = 0
        clauses = list()
        while loc < len(content):
            subLen = int(content[loc])  # number of clause lines for this sentence
            loc += 1
            j = 0
            while j < subLen:
                j += 1
                if len(content[loc].split()) > 2:
                    clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                loc += 1
        if len(clauses) < 1:
            iter += 1
            continue
        strClauses = clauses[0]
        for clause in clauses[1:]:
            strClauses += '###' + clause
        query = ("UPDATE review SET (review_clauses) = ('" + strClauses
                 + "') WHERE id = '" + str(entry[0]) + "'")
        mydb.executeQuery(con, query, False)
        iter += 1
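# Hypothetical driver for processSingle(): clause-split the first 1000 reviews
# with SPADE. The filter pattern p is a guess (the original never shows how it
# is built); here it skips sentences containing non-ASCII bytes.
import re
if __name__ == '__main__':
    p = re.compile(r'.*[^\x00-\x7f]')
    processSingle(0, 1000, 'sents.0.txt', 'spade', p)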
def main():
    from optparse import OptionParser
    # options
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--tfile", dest='type_filePath', help="Type file path")
    parser.add_option("-p", "--pfile", dest='property_filePath', help="Property file path")
    parser.add_option("-l", "--lang", dest='lang', help="language: EN, ES, RU, FA")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="output debug info, default is false", default=False)
    (options, args) = parser.parse_args()
    lang = options.lang
    tfilePath = options.type_filePath
    pfilePath = options.property_filePath
    debug = options.debug
    indexes = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    type_table_names = ["type_en", "type_es", "type_ru", "type_fa"]
    property_table_names = ["property_en", "property_es", "property_ru", "property_fa"]
    index = indexes[lang]
    type_table_name = type_table_names[index]
    property_table_name = property_table_names[index]
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    # create tables
    queries = []
    queries.append('DROP TABLE IF EXISTS ' + type_table_name + ';')
    queries.append('DROP TABLE IF EXISTS ' + property_table_name + ';')
    queries.append('CREATE TABLE ' + type_table_name + '(entity varchar, type varchar);')
    queries.append('CREATE TABLE ' + property_table_name + '(entity varchar, property varchar, value varchar);')
    mydb.executeManyQuery(con, queries, debug)
    # process the data
    tfile = open(tfilePath, 'r')
    pfile = open(pfilePath, 'r')
    # for the type table
    query_t = 'insert into __table_name__(entity,type) VALUES(%s, %s)'
    query_t = query_t.replace('__table_name__', type_table_name)
    processFile(con, query_t, tfile, True)
    # for the property table
    query_p = 'insert into __table_name__(entity,property,value) VALUES(%s, %s, %s)'
    query_p = query_p.replace('__table_name__', property_table_name)
    processFile(con, query_p, pfile, False)
    # build indexes
    queries = []
    query = 'create index __indexName__ on __index__;'
    queries.append(query.replace('__indexName__', type_table_name + '_entity')
                        .replace('__index__', type_table_name + '(entity)'))
    queries.append(query.replace('__indexName__', type_table_name + '_type')
                        .replace('__index__', type_table_name + '(type)'))
    queries.append(query.replace('__indexName__', property_table_name + '_entity')
                        .replace('__index__', property_table_name + '(entity)'))
    queries.append(query.replace('__indexName__', property_table_name + '_property')
                        .replace('__index__', property_table_name + '(property)'))
    mydb.executeManyQuery(con, queries, debug)
    # close db
    mydb.closeCon(con)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import mydb

# dump every distinct Persian 'type' value to a text file
CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)
query = "select distinct value from property_fa where property = '<http://fa.dbpedia.org/property/type>'"
rows = mydb.executeQueryResult(con, query, True)
file = open('types_fa.txt', 'w')
print len(rows)
for row in rows:
    file.write(row[0] + '\n')
file.close()
import mydb

# copy the Spanish translation of each property name into statistic_es;
# the two input files are read line by line in lock-step
typeFile = open('es_property.txt', 'r')
transFile = open('es_property_translate.txt', 'r')
CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)
while True:
    type = typeFile.readline()
    if not type:
        break
    trans = transFile.readline()
    type = type.strip()
    trans = trans.strip()
    query = "update statistic_es set native_property = '__trans__' where property = '__type__'"
    query = query.replace('__trans__', trans)
    query = query.replace('__type__', type)
    mydb.executeQuery(con, query, True)
typeFile.close()
transFile.close()