def main():
    """Populate the ``rc`` and/or ``tokenizedrc`` tables from the command line.

    Flags (both default to off):
        -r, --result    drop and recreate the ``rc`` table, then feed it from
                        the three ``result.*.json.txt`` files found under
                        ``<settings.PROJECT_DIR>/result/raw/``.
        -t, --tokenize  drop and recreate the ``tokenizedrc`` table, then
                        fill it with the pairs produced by ``loader.Loader``.

    A database connection is opened in every case, even when no flag is set.
    """
    from optparse import OptionParser

    # option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--result", dest='insertResult', action='store_true',
                      help="insert json result", default=False)
    parser.add_option("-t", "--tokenize", dest='insertToken', action='store_true',
                      help="insert tokenized rc pairs", default=False)
    (options, args) = parser.parse_args()
    insertResult = options.insertResult
    insertToken = options.insertToken

    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    dir_path = os.path.join(settings.PROJECT_DIR, 'result/raw/')

    if insertResult:
        # create db
        querys = []
        querys.append('drop table if exists rc;')
        querys.append('create table rc(id int,pairs text, tuples text, coref text);')
        mydb.executeManyQuery(con, querys, False)

        # NOTE(review): the 4th argument of insert() looks like the key to read
        # from each JSON record and the 5th like an insert-vs-update switch
        # (False for the INSERT, True for the UPDATEs) -- confirm against
        # insert()'s definition.

        # insert pairs
        # The path must be built with os.path.join; the previous
        # "os.path,join(...)" typo passed the os.path module as the file
        # argument and shifted every following argument by one.
        query = 'insert into rc(id,pairs) values(%s, %s)'
        insert(con, os.path.join(dir_path, 'result.sentence.json.txt'), query, 'sen_pairs', False)

        # insert tuples
        query = 'update rc set tuples = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.tuple.json.txt'), query, 'pairs', True)

        # insert coref
        query = 'update rc set coref = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.coref.json.txt'), query, 'coref', True)

    if insertToken:
        querys = []
        querys.append('drop table if exists tokenizedrc;')
        querys.append('create table tokenizedrc(id char(50),tknPair text);')
        mydb.executeManyQuery(con, querys, False)

        data = loader.Loader.load()
        pairs = loader.Loader.sent2pair(data)
        query = 'insert into tokenizedrc(id,tknPair) values(%s,%s);'
        insertTokenFunc(con, pairs, query)
def clean_review_clauses():
    """Null out ``review_clauses`` values that drifted too far from the review text.

    Every ``review`` row with a non-null ``review_clauses`` is read, the
    ``###`` markers are removed from the clause text, and the result is
    compared with ``review_text`` through ``jaccard_distance``.  Rows whose
    third returned value is below 0.6 are reported on stdout as
    ``<value> <id>`` and afterwards have ``review_clauses`` set to NULL,
    one UPDATE statement per row.
    """
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)

    query = 'select id,review_text,review_clauses from review where review_clauses is not null'
    records = mydb.executeQueryResult(con, query, False)

    idxs = []
    for record in records:
        idx = record[0]
        review_text = record[1]
        review_clauses = record[2].replace('###', '')

        # Only the third element of the returned triple is used here.
        _, _, jd = jaccard_distance(review_text, review_clauses)
        if jd < 0.6:
            # we think this would be a bad one
            # Single-string form prints "<jd> <idx>" on both Python 2 and 3;
            # the former "print jd,idx" statement is a SyntaxError on Python 3.
            print("%s %s" % (jd, idx))
            idxs.append(idx)

    # The ids substituted below were just read from the same table.
    query = 'update review set review_clauses = NULL where id = __idx__'
    for idx in idxs:
        query_n = query.replace('__idx__', str(idx))
        mydb.executeQuery(con, query_n, False)
return (False, token) def getClauseWord(loc_clause_cur, clauseLst): word_clause = clauseLst[loc_clause_cur] if word_clause.endswith('###'): word_clause = clauseLst[loc_clause_cur][0 : clauseLst[loc_clause_cur].index('###')] elif word_clause.startswith('###'): word_clause = clauseLst[loc_clause_cur][clauseLst[loc_clause_cur].index('###')+3 : ] else: word_clause = clauseLst[loc_clause_cur] return word_clause if __name__ == '__main__': from optparse import OptionParser # option usage="usage: %prog [options]" parser = OptionParser(usage=usage) parser.add_option("-f","--file",dest ='file', help="input file") (options,args) = parser.parse_args() file = options.file CONN_STRING = mydb.get_CONN() merge(file)