def main():
    """Command-line entry point: (re)build the result tables in the database.

    Options:
        -r, --result    drop and recreate the ``rc`` table, then hand the
                        sentence, tuple and coref result files found under
                        ``<PROJECT_DIR>/result/raw/`` to ``insert``.
        -t, --tokenize  drop and recreate the ``tokenizedrc`` table, then
                        hand the pairs produced by ``loader.Loader`` to
                        ``insertTokenFunc``.

    Both options default to off and may be combined.
    """
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog [options]")
    parser.add_option("-r", "--result", dest='insertResult',
                      action='store_true', help="insert json result",
                      default=False)
    parser.add_option("-t", "--tokenize", dest='insertToken',
                      action='store_true', help="insert tokenized rc pairs",
                      default=False)
    options, args = parser.parse_args()

    con = mydb.getCon(mydb.get_CONN())
    dir_path = os.path.join(settings.PROJECT_DIR, 'result/raw/')

    if options.insertResult:
        # Start from an empty table on every run.
        querys = [
            'drop table if exists rc;',
            'create table rc(id int,pairs text, tuples text, coref text);',
        ]
        mydb.executeManyQuery(con, querys, False)

        # The pairs file creates the rows; the other two files update them.
        query = 'insert into rc(id,pairs) values(%s, %s)'
        # Fixed: this call used to read ``os.path,join(...)``, which passed
        # the ``os.path`` module as the file argument.
        insert(con, os.path.join(dir_path, 'result.sentence.json.txt'),
               query, 'sen_pairs', False)

        query = 'update rc set tuples = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.tuple.json.txt'),
               query, 'pairs', True)

        query = 'update rc set coref = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.coref.json.txt'),
               query, 'coref', True)

    if options.insertToken:
        querys = [
            'drop table if exists tokenizedrc;',
            'create table tokenizedrc(id char(50),tknPair text);',
        ]
        mydb.executeManyQuery(con, querys, False)

        data = loader.Loader.load()
        pairs = loader.Loader.sent2pair(data)
        query = 'insert into tokenizedrc(id,tknPair) values(%s,%s);'
        insertTokenFunc(con, pairs, query)
def clean_review_clauses():
    """Reset ``review_clauses`` to NULL for reviews whose clauses look wrong.

    Every review with non-null ``review_clauses`` is fetched, the ``###``
    separators are stripped from the clause text, and the result is compared
    with ``review_text`` through ``jaccard_distance``. When the third value
    returned by that helper is below 0.6 the row is reported on stdout
    (score, then id) and afterwards its ``review_clauses`` is set to NULL.
    """
    con = mydb.getCon(mydb.get_CONN())
    select_sql = 'select id,review_text,review_clauses from review where review_clauses is not null'
    rows = mydb.executeQueryResult(con, select_sql, False)

    rejected_ids = []
    for row in rows:
        review_id, text, clauses = row[0], row[1], row[2]
        _first, _second, score = jaccard_distance(text, clauses.replace('###', ''))
        if score < 0.6:
            # Too far from the original text: treat the clause split as bad.
            print('%s %s' % (score, review_id))
            rejected_ids.append(review_id)

    update_template = 'update review set review_clauses = NULL where id = __idx__'
    for review_id in rejected_ids:
        statement = update_template.replace('__idx__', str(review_id))
        mydb.executeQuery(con, statement, False)
def insertCoref(fileName):
    """Insert every entry of the JSON object stored in *fileName* into ``coref``.

    Each top-level key is written to the ``id`` column and its value,
    re-serialised with ``json.dumps``, to the ``replace`` column. Rows are
    sent to ``mydb.executeQueryRecords`` in batches of 100. A
    ``psycopg2.DatabaseError`` raised for a batch is printed and that batch
    is dropped; processing continues with the next one.

    A progress counter is printed every 1000 entries.
    """
    batch_size = 100
    query = "insert into coref(id, replace) values(%s,%s);"
    con = mydb.getCon(CONN_STRING)

    # Read the whole document up front so the file handle is released early.
    with open(fileName) as json_file:
        data = json.load(json_file)

    def flush(batch):
        """Send one batch, reporting (not propagating) database errors."""
        try:
            mydb.executeQueryRecords(con, query, batch, False)
        except psycopg2.DatabaseError as e:
            print('Error %s' % e)

    records = []
    for i, key in enumerate(data):
        if i % 1000 == 0:
            print(i)
        records.append((key, json.dumps(data[key])))
        if len(records) >= batch_size:
            flush(records)
            records = []

    # Fixed: the trailing partial batch used to be silently discarded.
    if records:
        flush(records)
def _join_coref_markers(tokens):
    """Re-join whitespace-split tokens that belong to one ``<--...-->`` marker.

    A marker starts at a token containing ``<--`` and runs through the next
    token that ends with ``-->``; those tokens are glued back together with
    single spaces. All other tokens are passed through unchanged.

    Returns a ``(merged_tokens, marker_count)`` tuple. Raises ``IndexError``
    when a marker is opened but never terminated.
    """
    merged = []
    marker_count = 0
    i = 0
    while i < len(tokens):
        if "<--" in tokens[i]:
            marker_count += 1
            j = i
            while not tokens[j].endswith("-->"):
                j += 1
            merged.append(' '.join(tokens[i:j + 1]))
            i = j
        else:
            merged.append(tokens[i])
        i += 1
    return merged, marker_count


def _contexts_agree(clause_ctx, coref_ctx):
    """Return True when one context string is a prefix or suffix of the other.

    Backticks are normalised to apostrophes on both sides, and
    backslash-escaped slashes in the coref text are unescaped, before the
    comparison.
    """
    clause_ctx = clause_ctx.replace("`", "'")
    coref_ctx = coref_ctx.replace("`", "'").replace("\\/", "/")
    return (coref_ctx.endswith(clause_ctx) or clause_ctx.endswith(coref_ctx)
            or coref_ctx.startswith(clause_ctx)
            or clause_ctx.startswith(coref_ctx))


def merge(file):
    """Copy ``<--...-->`` coref markers from *file* onto the stored clauses.

    *file* contains one JSON object per line with a string ``id`` and a
    ``review_text`` in which some tokens carry a ``<--...-->`` marker. For
    each line the matching ``review.review_clauses`` value is loaded, and
    every marked token is searched for in the clause tokens. A candidate
    position is accepted when the (up to) three tokens before it and the
    (up to) three tokens after it agree with the corresponding coref
    context; the marker text is then appended to that clause token.

    The per-review result of ``genJSON`` is collected under the integer id
    and the whole mapping is dumped to ``data.txt``. A summary line with the
    number of applied markers, the number of markers seen and their ratio is
    printed. Lines whose id has no row, or whose ``review_clauses`` is NULL,
    are skipped.
    """
    con = mydb.getCon(CONN_STRING)
    merged_by_id = {}
    totalRepl = 0
    totalReplDone = 0

    with open(file) as json_date:
        for entry in json_date:
            data = json.loads(entry)
            # NOTE(review): the id is spliced into the SQL text unescaped;
            # switch to a parameterised call if mydb offers one.
            query = "select id, review_clauses from review where id = '" + data['id'] + "'"
            resultSet = mydb.executeQueryResult(con, query, False)
            # Fixed: an id without a row used to raise IndexError here.
            if not resultSet or resultSet[0][1] is None:
                continue

            # Collapse runs of dots on both sides so tokens line up.
            clauseStr = resultSet[0][1].decode('utf-8')
            clauseStr = re.sub(r"(\.+)", ".", clauseStr)
            corefStr = data['review_text'].replace("-RRB-", ")").replace("-LRB-", "(")
            corefStr = re.sub(r"(\.)+", ".", corefStr)

            corefLst_2, marker_count = _join_coref_markers(corefStr.split())
            totalRepl += marker_count

            clauseLst = clauseStr.split()
            clauseLstClone = list(clauseLst)

            # NOTE(review): the clause scan starts at the same index as the
            # coref token (both cursors advanced together in the original);
            # nesting was reconstructed from unindented source - confirm.
            for loc_coref in range(len(corefLst_2)):
                resCorefWord = getCorefWord(loc_coref, corefLst_2)
                if not resCorefWord[0]:
                    continue
                word_coref = resCorefWord[1]

                for loc_clause_cur in range(loc_coref, len(clauseLst)):
                    if getClauseWord(loc_clause_cur, clauseLst) != word_coref:
                        continue

                    # Up to three tokens of left context, concatenated in
                    # reading order.
                    prevClause = ''
                    prevCoref = ''
                    i = 1
                    while i <= 3 and loc_clause_cur - i >= 0 and loc_coref - i >= 0:
                        prevCoref = getCorefWord(loc_coref - i, corefLst_2)[1] + prevCoref
                        prevClause = getClauseWord(loc_clause_cur - i, clauseLst) + prevClause
                        i += 1
                    prevMatch = _contexts_agree(prevClause, prevCoref)

                    # Up to three tokens of right context.
                    nextClause = ''
                    nextCoref = ''
                    i = 1
                    while (i <= 3 and loc_clause_cur + i < len(clauseLst)
                           and loc_coref + i < len(corefLst_2)):
                        nextCoref = nextCoref + getCorefWord(loc_coref + i, corefLst_2)[1]
                        nextClause = nextClause + getClauseWord(loc_clause_cur + i, clauseLst)
                        i += 1
                    nextMatch = _contexts_agree(nextClause, nextCoref)

                    if prevMatch and nextMatch:
                        marked = corefLst_2[loc_coref]
                        repl = marked[marked.index("<--"):marked.index("-->") + 3]
                        totalReplDone += 1
                        clauseLstClone[loc_clause_cur] += repl

            merged_by_id[int(data['id'])] = genJSON(clauseLstClone)

    # Fixed: an input without any marker used to raise ZeroDivisionError
    # here, before the results were written.
    ratio = (totalReplDone + 0.0) / totalRepl if totalRepl else 0.0
    print('%s %s %s' % (totalReplDone, totalRepl, ratio))

    with open('data.txt', 'w') as outfile:
        json.dump(merged_by_id, outfile)
def clearTables(CONN_STRING):
    """Delete every row from the ``coref`` table.

    Opens its own connection from *CONN_STRING* and closes it again, even
    when the delete statement raises.
    """
    con = mydb.getCon(CONN_STRING)
    try:
        mydb.executeManyQuery(con, ['delete from coref'], False)
    finally:
        # Previously skipped when the query raised, leaking the connection.
        con.close()
def createTables(CONN_STRING):
    """Create the ``coref`` table (``id int`` primary key, ``replace text``).

    Opens its own connection from *CONN_STRING* and closes it again, even
    when the create statement raises (for example because the table
    already exists).
    """
    con = mydb.getCon(CONN_STRING)
    try:
        mydb.executeManyQuery(
            con,
            ['create table coref(id int, replace text, primary key(id))'],
            False,
        )
    finally:
        # Previously skipped when the query raised, leaking the connection.
        con.close()
def _index_replacements(items, repl):
    """Group the coref replacements that apply to *items* by replacement value.

    For every item, ``item[1]`` is looked up (as a string) in *repl*. Each
    hit is a mapping ``key -> replacement`` whose keys are integer strings.
    The result has the shape
    ``{replacement: {item[1]: (smallest int(key), occurrence count)}}``.
    """
    grouped = {}
    for item in items:
        clause_id = item[1]
        lookup = str(clause_id)
        if lookup not in repl:
            continue
        rep = repl[lookup]
        for key in rep:
            position = int(key)
            per_clause = grouped.setdefault(rep[key], {})
            if clause_id in per_clause:
                first, count = per_clause[clause_id]
                per_clause[clause_id] = (min(first, position), count + 1)
            else:
                per_clause[clause_id] = (position, 1)
    return grouped


def rcCoref(input, output):
    """Find coref replacements shared by both sides of each sentence pair.

    *input* contains one JSON object per line with an ``id`` and a
    ``sen_pairs`` list; every pair is ``(reas, cons)``, two sequences whose
    elements carry an identifier at index 1 (presumably reason and
    consequence clauses - confirm against the producer of the file). The
    stored ``coref.replace`` JSON for the id is loaded, and for each pair
    only the replacement values that occur on both sides are kept, together
    with the ``(smallest key, count)`` tuple of every clause involved.

    Ids without a ``coref`` row, and ids with no shared replacement, are left
    out. The resulting ``{id: {replacement: {clause: (key, count)}}}``
    mapping is passed to ``outputRes`` along with *output*.
    """
    con = mydb.getCon(CONN_STRING)
    bMap = {}

    with open(input) as json_date:
        for entry in json_date:
            data = json.loads(entry)
            iden = int(data['id'])
            # Build the SQL text from the validated integer, not the raw
            # JSON value.
            query = "select replace from coref where id = '" + str(iden) + "'"
            resultSet = mydb.executeQueryResult(con, query, False)
            if len(resultSet) == 0:
                continue
            repl = json.loads(resultSet[0][0])

            sMap = {}
            for reacon in data['sen_pairs']:
                reaMap = _index_replacements(reacon[0], repl)
                consMap = _index_replacements(reacon[1], repl)
                for key in reaMap:
                    if key not in consMap:
                        continue
                    # On a clause id clash the consequence side wins, as it
                    # is written last.
                    shared = sMap.setdefault(key, {})
                    shared.update(reaMap[key])
                    shared.update(consMap[key])

            if sMap:
                bMap[iden] = sMap

    outputRes(bMap, output)