def metaDataExtractor(groupedUtterances, markers, corpusType=''):
    """Extract marker occurrences and metadata features for each conversation.

    Parameters:
        groupedUtterances: list of conversations, each a list of utterance dicts.
        markers: marker definitions forwarded to findMarkersInConvo.
        corpusType: optional corpus label forwarded to addFeats (default '').

    Returns:
        List of per-conversation feature dicts.
    """
    results = []
    for i, convo in enumerate(groupedUtterances):
        # BUG FIX: original used `is` for an integer comparison, which only
        # "works" because of CPython's small-int cache; `==` is correct.
        if i % 2500 == 10:
            logger1.log("On " + str(i) + " of " + str(len(groupedUtterances)))
        toAppend = findMarkersInConvo(markers, convo)
        # convo[0] supplies per-conversation metadata for feature extraction
        toAppend = addFeats(toAppend, convo[0], True, corpusType)
        results.append(toAppend)
    return results
def metaDataExtractor(groupedUtterances, markers, corpusType=''):
    """Extract marker occurrences and metadata features for each conversation.

    Parameters:
        groupedUtterances: list of conversations, each a list of utterance dicts.
        markers: marker definitions forwarded to findMarkersInConvo.
        corpusType: optional corpus label forwarded to addFeats (default '').

    Returns:
        List of per-conversation feature dicts.
    """
    results = []
    for i, convo in enumerate(groupedUtterances):
        # BUG FIX: original used `is` for an integer comparison, which only
        # "works" because of CPython's small-int cache; `==` is correct.
        if i % 2500 == 10:
            logger1.log("On " + str(i) + " of " + str(len(groupedUtterances)))
        toAppend = findMarkersInConvo(markers, convo)
        # convo[0] supplies per-conversation metadata for feature extraction
        toAppend = addFeats(toAppend, convo[0], True, corpusType)
        results.append(toAppend)
    return results
def runFormula(results, markers, smoothing, corpusType):
    """Compute alignment dicts for every (result, marker-category) pair.

    Parameters:
        results: per-conversation result dicts from metaDataExtractor.
        markers: marker definitions; expanded to categories via allMarkers.
        smoothing: smoothing constant forwarded to createAlignmentDict.
        corpusType: corpus label forwarded to createAlignmentDict.

    Returns:
        List of alignment dicts sorted by (speakerId, replierId, category).
    """
    toReturn = []
    categories = allMarkers(markers)
    for i, result in enumerate(results):
        # BUG FIX: original used `is` for an integer comparison (relies on
        # CPython small-int caching); `==` is the correct operator.
        if i % 1000 == 10:
            logger1.log("On result " + str(i) + " of " + str(len(results)))
        for category in categories:  # unused index `j` removed
            toAppend = createAlignmentDict(category, result, smoothing, corpusType)
            # createAlignmentDict may return None for categories with no data
            if toAppend is not None:
                toReturn.append(toAppend)
    toReturn = sorted(toReturn, key=lambda k: (k["speakerId"], k["replierId"], k["category"]))
    return toReturn
def runFormula(results, markers, smoothing, corpusType):
    """Compute alignment dicts for every (result, marker-category) pair.

    Parameters:
        results: per-conversation result dicts from metaDataExtractor.
        markers: marker definitions; expanded to categories via allMarkers.
        smoothing: smoothing constant forwarded to createAlignmentDict.
        corpusType: corpus label forwarded to createAlignmentDict.

    Returns:
        List of alignment dicts sorted by (speakerId, replierId, category).
    """
    toReturn = []
    categories = allMarkers(markers)
    for i, result in enumerate(results):
        # BUG FIX: original used `is` for an integer comparison (relies on
        # CPython small-int caching); `==` is the correct operator.
        if i % 1000 == 10:
            logger1.log("On result " + str(i) + " of " + str(len(results)))
        for category in categories:  # unused index `j` removed
            toAppend = createAlignmentDict(category, result, smoothing, corpusType)
            # createAlignmentDict may return None for categories with no data
            if toAppend is not None:
                toReturn.append(toAppend)
    toReturn = sorted(toReturn, key=lambda k: (k["speakerId"], k["replierId"], k["category"]))
    return toReturn
def writeFile(results, outputFile, shouldWriteHeader):
    """Write a list of result dicts to a CSV file with sorted column order.

    Parameters:
        results: list of dicts; all rows are assumed to share the keys of
            the first row.
        outputFile: path of the CSV file to create or append to.
        shouldWriteHeader: if True, truncate the file and write a header row
            before the data; otherwise append data rows to the existing file.

    Returns:
        None. Logs and returns early when there is nothing to write.
    """
    if not results:
        logger1.log("No results to write =(")
        return
    # Column order is the sorted key set of the first row.
    header = sorted(results[0].keys())
    rows = [[row[key] for key in header] for row in results]
    # FIX: the original opened the file twice (once "w" for the header,
    # then again "a" for the rows) and called f.close() inside `with`
    # blocks, which is redundant. One open with the right mode suffices.
    mode = "w" if shouldWriteHeader else "a"
    with open(outputFile, mode, newline='') as f:
        writer = csv.writer(f)
        if shouldWriteHeader:
            writer.writerow(header)
        writer.writerows(rows)
def shuffleUtterances(utterances, shuffleIds, shuffleTweets, shuffleTokens, combineMsgReply):
    """Randomly permute user ids, whole tweets, and/or tokens across utterances.

    Used to build a shuffled baseline (permutation test) for alignment scores.

    Parameters:
        utterances: list of utterance dicts with "msgUserId", "replyUserId",
            "msgTokens", "replyTokens" keys; mutated in place.
        shuffleIds: permute user ids across utterances.
        shuffleTweets: permute whole token lists across utterances.
        shuffleTokens: permute individual tokens across all utterances.
        combineMsgReply: pool messages and replies into one population before
            shuffling instead of shuffling them separately.

    Returns:
        The same `utterances` list, mutated in place, with "convId" rebuilt
        from the (possibly shuffled) user ids.
    """
    replyUserIds = []
    msgUserIds = []
    replyTweets = []
    msgTweets = []
    allReplyTokens = []
    allMsgTokens = []
    replyLengths = []
    msgLengths = []
    for i, utterance in enumerate(utterances):
        # BUG FIX: `i % 10000 is 0` used identity for an int comparison;
        # replaced with `==`.
        if i % 10000 == 0:
            logger1.log("Adding to utterances " + str(i) + " of " + str(len(utterances)))
        msgUserIds.append(utterance["msgUserId"])
        msgTweets.append(utterance["msgTokens"])
        allMsgTokens.extend(utterance["msgTokens"])
        msgLengths.append(len(utterance["msgTokens"]))
        if not combineMsgReply:
            # if we're shuffling msgs and replies together, put everything in msgs
            replyUserIds.append(utterance["replyUserId"])
            replyTweets.append(utterance["replyTokens"])
            allReplyTokens.extend(utterance["replyTokens"])
            replyLengths.append(len(utterance["replyTokens"]))
        else:
            msgUserIds.append(utterance["replyUserId"])
            msgTweets.append(utterance["replyTokens"])
            allMsgTokens.extend(utterance["replyTokens"])
            msgLengths.append(len(utterance["replyTokens"]))
    shuffle(msgUserIds)
    shuffle(msgTweets)
    shuffle(allMsgTokens)
    if not combineMsgReply:
        shuffle(replyUserIds)
        shuffle(replyTweets)
        shuffle(allReplyTokens)
    else:
        shuffle(msgLengths)  # only shuffle msgLengths if we're combining msgs and replies
    replyMarkerCount = 0
    msgMarkerCount = 0
    msgLengthsNew = []
    replyLengthsNew = []
    for i, utterance in enumerate(utterances):
        utterances[i]["msg"] = ""
        utterances[i]["reply"] = ""
        if shuffleIds:
            if not combineMsgReply:
                utterances[i]["msgUserId"] = msgUserIds[i]
                utterances[i]["replyUserId"] = replyUserIds[i]
            else:
                # combined pool holds msg and reply ids interleaved 2-per-utterance
                utterances[i]["msgUserId"] = msgUserIds[2 * i]
                utterances[i]["replyUserId"] = msgUserIds[2 * i + 1]
        if shuffleTweets:
            if not combineMsgReply:
                utterances[i]["msgTokens"] = msgTweets[i]
                utterances[i]["replyTokens"] = replyTweets[i]
            else:
                utterances[i]["msgTokens"] = msgTweets[2 * i]
                utterances[i]["replyTokens"] = msgTweets[2 * i + 1]
        if shuffleTokens:
            # Carve token lists of the original lengths back out of the
            # shuffled flat token pools.
            if not combineMsgReply:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[i])]
                msgMarkerCount += msgLengths[i]
                utterances[i]["replyTokens"] = allReplyTokens[replyMarkerCount:(replyMarkerCount + replyLengths[i])]
                replyMarkerCount += replyLengths[i]
            else:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[2 * i])]
                msgMarkerCount += msgLengths[2 * i]
                msgLengthsNew.append(msgLengths[2 * i])
                utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[2 * i + 1])]
                msgMarkerCount += msgLengths[2 * i + 1]
                replyLengthsNew.append(msgLengths[2 * i + 1])
        utterances[i]["convId"] = (utterances[i]["msgUserId"], utterances[i]["replyUserId"])
    return utterances
# Core calls
# NOTE(review): this chunk originally began with an orphaned duplicate of the
# tail of shuffleUtterances (including a module-level `return`), which is a
# SyntaxError at top level; that residue has been removed.
start = logger1.initialize()

# Reading in user info and tweets
logger1.log("Reading user info...")
users = readUserInfo()
logger1.log("Reading messages...")
result = readCSV(inputFile, users, numMarkers)
rows = result["rows"]
markers = result["markers"]

# Shuffling tweets if any shuffling has been requested
if someShuffling:
    logger1.log(rows[0])
    rows = shuffleUtterances(rows, shuffleIds, shuffleTweets, shuffleMarkers, combineMsgReply)
    logger1.log(rows[0])

# Adding user info & extracting markers from messages
utterances = transformCSV(markers, users, rows)
def shuffleUtterances(utterances, shuffleIds, shuffleTweets, shuffleTokens, combineMsgReply):
    """Randomly permute user ids, whole tweets, and/or tokens across utterances.

    Used to build a shuffled baseline (permutation test) for alignment scores.

    Parameters:
        utterances: list of utterance dicts with "msgUserId", "replyUserId",
            "msgTokens", "replyTokens" keys; mutated in place.
        shuffleIds: permute user ids across utterances.
        shuffleTweets: permute whole token lists across utterances.
        shuffleTokens: permute individual tokens across all utterances.
        combineMsgReply: pool messages and replies into one population before
            shuffling instead of shuffling them separately.

    Returns:
        The same `utterances` list, mutated in place, with "convId" rebuilt
        from the (possibly shuffled) user ids.
    """
    replyUserIds = []
    msgUserIds = []
    replyTweets = []
    msgTweets = []
    allReplyTokens = []
    allMsgTokens = []
    replyLengths = []
    msgLengths = []
    for i, utterance in enumerate(utterances):
        # BUG FIX: `i % 10000 is 0` used identity for an int comparison;
        # replaced with `==`.
        if i % 10000 == 0:
            logger1.log("Adding to utterances " + str(i) + " of " + str(len(utterances)))
        msgUserIds.append(utterance["msgUserId"])
        msgTweets.append(utterance["msgTokens"])
        allMsgTokens.extend(utterance["msgTokens"])
        msgLengths.append(len(utterance["msgTokens"]))
        if not combineMsgReply:
            # if we're shuffling msgs and replies together, put everything in msgs
            replyUserIds.append(utterance["replyUserId"])
            replyTweets.append(utterance["replyTokens"])
            allReplyTokens.extend(utterance["replyTokens"])
            replyLengths.append(len(utterance["replyTokens"]))
        else:
            msgUserIds.append(utterance["replyUserId"])
            msgTweets.append(utterance["replyTokens"])
            allMsgTokens.extend(utterance["replyTokens"])
            msgLengths.append(len(utterance["replyTokens"]))
    shuffle(msgUserIds)
    shuffle(msgTweets)
    shuffle(allMsgTokens)
    if not combineMsgReply:
        shuffle(replyUserIds)
        shuffle(replyTweets)
        shuffle(allReplyTokens)
    else:
        shuffle(msgLengths)  # only shuffle msgLengths if we're combining msgs and replies
    replyMarkerCount = 0
    msgMarkerCount = 0
    msgLengthsNew = []
    replyLengthsNew = []
    for i, utterance in enumerate(utterances):
        utterances[i]["msg"] = ""
        utterances[i]["reply"] = ""
        if shuffleIds:
            if not combineMsgReply:
                utterances[i]["msgUserId"] = msgUserIds[i]
                utterances[i]["replyUserId"] = replyUserIds[i]
            else:
                # combined pool holds msg and reply ids interleaved 2-per-utterance
                utterances[i]["msgUserId"] = msgUserIds[2 * i]
                utterances[i]["replyUserId"] = msgUserIds[2 * i + 1]
        if shuffleTweets:
            if not combineMsgReply:
                utterances[i]["msgTokens"] = msgTweets[i]
                utterances[i]["replyTokens"] = replyTweets[i]
            else:
                utterances[i]["msgTokens"] = msgTweets[2 * i]
                utterances[i]["replyTokens"] = msgTweets[2 * i + 1]
        if shuffleTokens:
            # Carve token lists of the original lengths back out of the
            # shuffled flat token pools.
            if not combineMsgReply:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[i])]
                msgMarkerCount += msgLengths[i]
                utterances[i]["replyTokens"] = allReplyTokens[replyMarkerCount:(replyMarkerCount + replyLengths[i])]
                replyMarkerCount += replyLengths[i]
            else:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[2 * i])]
                msgMarkerCount += msgLengths[2 * i]
                msgLengthsNew.append(msgLengths[2 * i])
                utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount + msgLengths[2 * i + 1])]
                msgMarkerCount += msgLengths[2 * i + 1]
                replyLengthsNew.append(msgLengths[2 * i + 1])
        utterances[i]["convId"] = (utterances[i]["msgUserId"], utterances[i]["replyUserId"])
    return utterances
# Core calls
# NOTE(review): this chunk originally began with an orphaned duplicate of the
# `else:` tail of shuffleUtterances (including a module-level `return`), which
# is a SyntaxError at top level; that residue has been removed.
start = logger1.initialize()

# Reading in user info and tweets
logger1.log("Reading user info...")
users = readUserInfo()
logger1.log("Reading messages...")
result = readCSV(inputFile, users, numMarkers)
rows = result["rows"]
markers = result["markers"]

# Shuffling tweets if any shuffling has been requested
if someShuffling:
    logger1.log(rows[0])
    rows = shuffleUtterances(rows, shuffleIds, shuffleTweets, shuffleMarkers, combineMsgReply)
    logger1.log(rows[0])

# Adding user info & extracting markers from messages
utterances = transformCSV(markers, users, rows)