Beispiel #1
0
def main():
    """Command-line entry point that (re)builds the result tables.

    Options:
        -r / --result    "insert json result": drop and recreate table ``rc``
                         and load the three JSON result files into it.
        -t / --tokenize  "insert tokenized rc pairs": drop and recreate table
                         ``tokenizedrc`` and fill it from the loader output.

    Both options default to off; with neither given only the database
    connection is opened.
    """
    from optparse import OptionParser

    # option
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--result", dest='insertResult', action='store_true',
                      help="insert json result", default=False)
    parser.add_option("-t", "--tokenize", dest='insertToken', action='store_true',
                      help="insert tokenized rc pairs", default=False)

    options, _args = parser.parse_args()
    insertResult = options.insertResult
    insertToken = options.insertToken

    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    dir_path = os.path.join(settings.PROJECT_DIR, 'result/raw/')

    if insertResult:
        # create db
        querys = [
            'drop table if exists rc;',
            'create table rc(id int,pairs text, tuples text, coref text);',
        ]
        mydb.executeManyQuery(con, querys, False)

        # NOTE(review): the last argument of insert() presumably tells it that
        # the placeholders are in (value, id) order for the update statements
        # -- confirm against insert()'s definition.

        # insert pairs
        query = 'insert into rc(id,pairs) values(%s, %s)'
        # The path must be built with os.path.join; the original passed
        # `os.path` and `join(...)` as two separate arguments.
        insert(con, os.path.join(dir_path, 'result.sentence.json.txt'), query, 'sen_pairs', False)
        # insert tuples
        query = 'update rc set tuples = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.tuple.json.txt'), query, 'pairs', True)
        # insert coref
        query = 'update rc set coref = %s where id = %s'
        insert(con, os.path.join(dir_path, 'result.coref.json.txt'), query, 'coref', True)

    if insertToken:
        querys = [
            'drop table if exists tokenizedrc;',
            'create table tokenizedrc(id char(50),tknPair text);',
        ]
        mydb.executeManyQuery(con, querys, False)

        data = loader.Loader.load()
        pairs = loader.Loader.sent2pair(data)
        query = 'insert into tokenizedrc(id,tknPair) values(%s,%s);'
        insertTokenFunc(con, pairs, query)
Beispiel #2
0
def clean_review_clauses():
    """Reset ``review_clauses`` to NULL for reviews whose clauses look wrong.

    Every review row with a non-null ``review_clauses`` is fetched, the
    '###' separators are stripped from the clause text, and the result is
    compared with ``review_text`` via ``jaccard_distance``.  Rows whose
    third returned value is below 0.6 are printed ("<score> <id>") and,
    once the scan is complete, have their ``review_clauses`` set to NULL.
    """
    connection = mydb.getCon(mydb.get_CONN())
    select_sql = 'select id,review_text,review_clauses from review where review_clauses is not null'
    rows = mydb.executeQueryResult(connection, select_sql, False)

    rejected_ids = []
    for row in rows:
        review_id, text, clauses = row[0], row[1], row[2]
        clauses = clauses.replace('###', '')
        _first, _second, score = jaccard_distance(text, clauses)
        if not score < 0.6:
            continue
        # Scores under the threshold are treated as bad clause splits.
        print('%s %s' % (score, review_id))
        rejected_ids.append(review_id)

    update_template = 'update review set review_clauses = NULL where id = __idx__'
    for review_id in rejected_ids:
        statement = update_template.replace('__idx__', str(review_id))
        mydb.executeQuery(connection, statement, False)
Beispiel #3
0
def insertCoref(fileName):
    """Load a JSON object from ``fileName`` and insert it into table ``coref``.

    Each top-level key becomes the ``id`` column and the JSON-serialised
    value becomes the ``replace`` column.  Rows are written in batches of
    100; a progress counter is printed every 1000 keys.  A
    ``psycopg2.DatabaseError`` raised while writing a batch is printed and
    that batch is skipped.

    Args:
        fileName: path of the JSON file to load.
    """
    batchSize = 100
    con = mydb.getCon(CONN_STRING)
    query = "insert into coref(id, replace) values(%s,%s);"

    # Close the file as soon as the document is parsed.
    with open(fileName) as json_file:
        data = json.load(json_file)

    def flush(batch):
        # Best-effort write: report a database error and carry on.
        try:
            mydb.executeQueryRecords(con, query, batch, False)
        except psycopg2.DatabaseError as e:
            print('Error %s' % e)

    records = []
    for i, key in enumerate(data):
        if i % 1000 == 0:
            print(i)

        records.append((key, json.dumps(data[key])))
        if len(records) >= batchSize:
            flush(records)
            records = []

    # Write the trailing partial batch; without this up to batchSize - 1
    # rows at the end of the input were silently dropped.
    if records:
        flush(records)
Beispiel #4
0
def _affixMatch(first, second):
    """Return True if either string is a prefix or a suffix of the other."""
    return (first.endswith(second) or second.endswith(first)
            or first.startswith(second) or second.startswith(first))


def merge(file):
    """Copy ``<-- ... -->`` markers from coref text onto stored review clauses.

    ``file`` is read line by line; each line is a JSON object with an 'id'
    and a 'review_text'.  For every id the stored ``review_clauses`` is
    fetched from table ``review``; ids with no row or a NULL value are
    skipped.  Tokens of 'review_text' from one containing "<--" up to the
    one ending in "-->" are joined into a single token.  For each coref
    token accepted by ``getCorefWord`` the clause tokens are scanned, and
    where the word matches and up to three neighbouring words on both
    sides agree, the marker text is appended to that clause token.

    The result, ``{int(id): genJSON(tokens)}``, is written to 'data.txt',
    and "<markers copied> <markers seen> <ratio>" is printed.

    Args:
        file: path of the line-delimited JSON input.
    """
    con = mydb.getCon(CONN_STRING)
    merged = {}
    totalRepl = 0
    totalReplDone = 0

    with open(file) as json_date:
        for entry in json_date:
            data = json.loads(entry)
            # NOTE(review): the id is concatenated straight into the SQL
            # text; switch to a parameterised query if mydb supports one.
            query = "select id, review_clauses from review where id = '" + data['id'] + "'"
            corefStr = data['review_text']
            resultSet = mydb.executeQueryResult(con, query, False)

            # Skip ids that have no row at all as well as NULL clauses.
            if not resultSet or resultSet[0][1] is None:
                continue
            clauseStr = resultSet[0][1].decode('utf-8')
            clauseStr = re.sub(r"(\.+)", ".", clauseStr)
            corefStr = corefStr.replace("-RRB-", ")").replace("-LRB-", "(")
            corefStr = re.sub(r"(\.)+", ".", corefStr)
            corefLst = corefStr.split()

            # Glue every "<-- ... -->" span back into one token.  A "<--"
            # with no closing "-->" runs past the end of the list and
            # raises IndexError, exactly as before.
            corefLst_2 = []
            i = 0
            while i < len(corefLst):
                if "<--" in corefLst[i]:
                    totalRepl += 1
                    j = i
                    parts = []
                    while not corefLst[j].endswith("-->"):
                        parts.append(corefLst[j])
                        j += 1
                    parts.append(corefLst[j])
                    corefLst_2.append(' '.join(parts))
                    i = j
                else:
                    corefLst_2.append(corefLst[i])
                i += 1

            clauseLst = clauseStr.split()
            clauseLstClone = list(clauseLst)
            loc_clause = 0
            loc_coref = 0

            while loc_coref < len(corefLst_2):
                resCorefWord = getCorefWord(loc_coref, corefLst_2)
                if resCorefWord[0]:
                    word_coref = resCorefWord[1]
                    loc_clause_cur = loc_clause
                    while loc_clause_cur < len(clauseLst):
                        word_clause = getClauseWord(loc_clause_cur, clauseLst)

                        if word_clause == word_coref:
                            # Up to three preceding words on each side.
                            i = 1
                            prevClause = ''
                            prevCoref = ''
                            while loc_clause_cur - i >= 0 and i <= 3 and loc_coref - i >= 0:
                                prevCoref = getCorefWord(loc_coref - i, corefLst_2)[1] + prevCoref
                                prevClause = getClauseWord(loc_clause_cur - i, clauseLst) + prevClause
                                i += 1
                            prevClause = prevClause.replace("`", "'")
                            prevCoref = prevCoref.replace("`", "'")
                            prevCoref = prevCoref.replace("\\/", "/")
                            prevMatch = _affixMatch(prevCoref, prevClause)

                            # Up to three following words on each side.
                            i = 1
                            nextClause = ''
                            nextCoref = ''
                            while loc_clause_cur + i < len(clauseLst) and i <= 3 and loc_coref + i < len(corefLst_2):
                                nextCoref = nextCoref + getCorefWord(loc_coref + i, corefLst_2)[1]
                                nextClause = nextClause + getClauseWord(loc_clause_cur + i, clauseLst)
                                i += 1
                            nextClause = nextClause.replace("`", "'")
                            nextCoref = nextCoref.replace("`", "'")
                            nextCoref = nextCoref.replace("\\/", "/")
                            nextMatch = _affixMatch(nextClause, nextCoref)

                            if prevMatch and nextMatch:
                                marked = corefLst_2[loc_coref]
                                repl = marked[marked.index("<--"):marked.index("-->") + 3]
                                totalReplDone += 1
                                clauseLstClone[loc_clause_cur] += repl
                        loc_clause_cur += 1
                loc_coref += 1
                loc_clause += 1
            merged[int(data['id'])] = genJSON(clauseLstClone)

    # Avoid ZeroDivisionError when the input contained no markers at all.
    ratio = (totalReplDone + 0.0) / totalRepl if totalRepl else 0.0
    print('%s %s %s' % (totalReplDone, totalRepl, ratio))
    with open('data.txt', 'w') as outfile:
        json.dump(merged, outfile)
Beispiel #5
0
def clearTables(CONN_STRING):
    """Delete every row from table ``coref``.

    Args:
        CONN_STRING: connection string handed to ``mydb.getCon``.

    The connection is closed before returning, including when the
    statement raises.
    """
    con = mydb.getCon(CONN_STRING)
    try:
        mydb.executeManyQuery(con, ['delete from coref'], False)
    finally:
        # Release the connection even if the delete fails.
        con.close()
Beispiel #6
0
def createTables(CONN_STRING):
    """Create table ``coref(id int primary key, replace text)``.

    Args:
        CONN_STRING: connection string handed to ``mydb.getCon``.

    The connection is closed before returning, including when the
    statement raises (for example because the table already exists).
    """
    con = mydb.getCon(CONN_STRING)
    try:
        queries = ['create table coref(id int, replace text, primary key(id))']
        mydb.executeManyQuery(con, queries, False)
    finally:
        # Release the connection even if table creation fails.
        con.close()
Beispiel #7
0
def _collectReplacements(clauses, repl):
    """Group the replacement entries of ``clauses`` by replacement value.

    ``clauses`` is a sequence whose items carry a clause id at index 1;
    ``repl`` maps ``str(clause id)`` to a ``{key: value}`` mapping.  The
    result is ``{value: {clause id: (smallest int(key), occurrences)}}``.
    """
    grouped = {}
    for clause in clauses:
        clauseId = clause[1]
        if str(clauseId) in repl:
            rep = repl[str(clauseId)]
            for key in rep:
                perClause = grouped.setdefault(rep[key], {})
                if clauseId in perClause:
                    lowest, count = perClause[clauseId]
                    perClause[clauseId] = (min(lowest, int(key)), count + 1)
                else:
                    perClause[clauseId] = (int(key), 1)
    return grouped


def rcCoref(input, output):
    """Find replacement values shared by both sides of each sentence pair.

    ``input`` is read line by line; each line is a JSON object with an
    'id' and a 'sen_pairs' list whose items are two-element sequences of
    clause lists.  For every id the JSON stored in ``coref.replace`` is
    loaded; ids without a row are skipped.  Replacement values that occur
    on both sides of a pair are collected per id, and the resulting
    mapping is handed to ``outputRes(result, output)``.

    Args:
        input: path of the line-delimited JSON input.
        output: passed through unchanged to ``outputRes``.
    """
    con = mydb.getCon(CONN_STRING)
    bMap = {}

    with open(input) as json_date:
        for entry in json_date:
            data = json.loads(entry)
            iden = int(data['id'])
            # Only the int-validated id goes into the SQL text, so nothing
            # but digits (and a sign) can reach the statement.
            query = "select replace from coref  where id = '" + str(iden) + "'"
            resultSet = mydb.executeQueryResult(con, query, False)
            if not resultSet:
                continue
            repl = json.loads(resultSet[0][0])
            sMap = {}

            for reacon in data['sen_pairs']:
                reas = reacon[0]
                conss = reacon[1]
                reaMap = _collectReplacements(reas, repl)
                consMap = _collectReplacements(conss, repl)

                # Keep only values present on both sides of the pair.
                for key in reaMap:
                    if key in consMap:
                        shared = sMap.setdefault(key, {})
                        shared.update(reaMap[key])
                        shared.update(consMap[key])

            if sMap:
                bMap[iden] = sMap

    outputRes(bMap, output)