def readCSV(inputFile, users, numOfMarkers):
    """Read a tab-delimited file of tweet pairs and build the rows and marker list.

    The first line of the file is treated as a header and discarded. Every
    remaining line is passed through ``processTweetCSVRow``; lines for which
    it returns ``None`` are skipped.

    Relies on the module-level names ``markersFromData``, ``markersFile``,
    ``processTweetCSVRow`` and ``alignment``.

    Args:
        inputFile: Path of the file to read. It is opened with
            ``errors="ignore"`` so undecodable bytes are dropped rather than
            raising.
        users: Not used by this function; kept so existing callers still work.
        numOfMarkers: Maximum number of most-frequent tokens to turn into
            markers when ``markersFromData`` is truthy.

    Returns:
        A dict with two keys: ``"rows"`` (the list of processed row dicts,
        each with an added boolean ``"reciprocity"`` entry) and ``"markers"``
        (a list of ``{"marker": ..., "category": ...}`` dicts built from the
        data, or whatever ``alignment.readMarkers`` returns).
    """
    utterances = []
    freqs = {}
    # Keyed by each row's "convId"; the stored False is only a placeholder,
    # membership is what the reciprocity check below relies on.
    userPairs = {}

    # The context manager guarantees the file is closed, including when
    # processTweetCSVRow raises part-way through the file.
    with open(inputFile, errors="ignore") as csvFile:
        reader = csv.reader(csvFile, dialect="excel-tab")
        next(reader, None)  # discard the header line (no-op on an empty file)

        for row in reader:
            row = processTweetCSVRow(row)
            if row is None:
                # The tweet pair is not valid (e.g., no text in at least one tweet).
                continue
            utterances.append(row)

            # Word frequencies are only needed when markers come from the dataset.
            if markersFromData:
                for word in row["msgTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1
                for word in row["replyTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1

            # Record the pair for the reciprocity check.
            userPair = row["convId"]
            if userPair not in userPairs:
                userPairs[userPair] = False

    # A row is reciprocal when the reversed (replier, messager) tuple was also
    # seen as a "convId".
    # NOTE(review): this presumes "convId" is a (msgUserId, replyUserId)
    # tuple -- confirm against processTweetCSVRow.
    for utterance in utterances:
        utterance["reciprocity"] = (
            (utterance["replyUserId"], utterance["msgUserId"]) in userPairs
        )

    if markersFromData:
        # Most frequent tokens first; sorted() is stable, so ties keep the
        # order in which the tokens were first seen.
        ranked = sorted(freqs, key=freqs.get, reverse=True)
        # The original comment says up to two types ([mention] & [url]) will
        # be removed; that removal does not happen in this function.
        markers = [
            {"marker": word, "category": word} for word in ranked[0:numOfMarkers]
        ]
    else:
        markers = alignment.readMarkers(markersFile, "excel-tab")

    return {"rows": utterances, "markers": markers}
def readCSV(inputFile, users, numOfMarkers):
    """Load tweet pairs from a tab-delimited file and return rows plus markers.

    The header line is skipped, then each line is converted by
    ``processTweetCSVRow``; a ``None`` result means the line is dropped.
    Depends on the module-level names ``markersFromData``, ``markersFile``,
    ``processTweetCSVRow`` and ``alignment``.

    Args:
        inputFile: Path of the file to read (opened with ``errors="ignore"``,
            so undecodable bytes are silently dropped).
        users: Unused here; retained for backward compatibility with callers.
        numOfMarkers: Upper bound on how many of the most frequent tokens
            become markers when ``markersFromData`` is truthy.

    Returns:
        ``{"rows": rows, "markers": markers}`` where ``rows`` is the list of
        processed row dicts, each given a boolean ``"reciprocity"`` entry, and
        ``markers`` is either a list of ``{"marker": w, "category": w}`` dicts
        derived from the data or the result of ``alignment.readMarkers``.
    """
    rows = []
    tokenCounts = {}
    # Maps every "convId" seen to a placeholder False; only key membership
    # is consulted afterwards.
    userPairs = {}

    # Use a context manager so the handle is released even if row processing
    # fails; the original left the file open.
    with open(inputFile, errors="ignore") as handle:
        reader = csv.reader(handle, dialect="excel-tab")
        next(reader, None)  # skip the header; harmless when the file is empty

        for rawRow in reader:
            parsed = processTweetCSVRow(rawRow)
            if parsed is None:
                # Invalid tweet pair (e.g., no text in at least one tweet).
                continue
            rows.append(parsed)

            # Token frequencies are collected only when markers are derived
            # from the dataset itself.
            if markersFromData:
                for token in parsed["msgTokens"]:
                    tokenCounts[token] = tokenCounts.get(token, 0) + 1
                for token in parsed["replyTokens"]:
                    tokenCounts[token] = tokenCounts.get(token, 0) + 1

            # Remember the pair so reciprocity can be checked afterwards.
            if parsed["convId"] not in userPairs:
                userPairs[parsed["convId"]] = False

    # Flag a row as reciprocal when the swapped (replier, messager) tuple is
    # itself a recorded "convId".
    # NOTE(review): presumably "convId" is a (msgUserId, replyUserId) tuple;
    # verify against processTweetCSVRow.
    for entry in rows:
        swapped = (entry["replyUserId"], entry["msgUserId"])
        entry["reciprocity"] = swapped in userPairs

    if markersFromData:
        # Descending frequency; ties stay in first-seen order because the
        # sort is stable.
        byFrequency = sorted(tokenCounts, key=tokenCounts.get, reverse=True)
        # Per the original comment, up to two types ([mention] & [url]) will
        # be removed; no such removal is performed in this function.
        markers = [
            {"marker": token, "category": token}
            for token in byFrequency[0:numOfMarkers]
        ]
    else:
        markers = alignment.readMarkers(markersFile, "excel-tab")

    return {"rows": rows, "markers": markers}
# marker_list = alignment.readMarkers(markersFile) elif tag == '-o': #-o: output file outputFile = featval elif tag == '-c': #-c: match messages on categories rather than individual words useCategories = True elif tag == '-r': #-r: match words defined by regular expressions (will automatically use categories) useREs = True useCategories = True elif tag == '-m': #-m: use marker list from the specified file markersFile = featval #outputFile = corpus+'Results.'+os.path.splitext(os.path.split(markersFile)[1])[0]+'.csv' #using the marker filename in the results filename outputFile = 'CHILDES_liwc/ENG-NA-MOR/' + corpus + '/Results.' + os.path.splitext( os.path.split(markersFile)[1] )[0] + '.csv' #using the marker filename in the results filen print('Will load markers from', markersFile) marker_list = alignment.readMarkers(markersFile, dialect="excel-tab") elif tag == '-S': #-S: Supreme Court analysis corpus = 'SCOTUS' outputFile = 'results/SCOTUS/SCOTUSResults.csv' markersFile = 'wordlists/liwc2007_converted.tsv' corpus_dir = 'data/SCOTUS/' Subdirs = True corpus_name = corpus marker_list = alignment.readMarkers(markersFile, dialect="excel-tab") elif tag == '-R': #-R: extract dialogue participants' roles extractRoles = True writeRoleHeader = True roleOutputFile = 'results/SCOTUS/SCOTUSRoles.csv' wrf = open(roleOutputFile, 'w') else:
useREs = False useCategories = False extractRoles = False if len(sys.argv) > 1: for arg in sys.argv[1:]: tag = arg[0:2] featval = arg[3:] # VERY IMPORTANT: if you use -C or -S tag, make sure it comes first!! Other tags will be overwritten by it at the moment. if tag == "-C": # -C: corpus name corpus = featval outputFile = corpus + "Results300.csv" markersFile = "wordlists/" + corpus + "Marker300.csv" corpus_dir = "data/" + corpus + "/" corpus_name = corpus marker_list = alignment.readMarkers(markersFile) elif tag == "-o": # -o: output file outputFile = featval elif tag == "-c": # -c: match messages on categories rather than individual words useCategories = True elif tag == "-r": # -r: match words defined by regular expressions (will automatically use categories) useREs = True useCategories = True elif tag == "-m": # -m: use marker list from the specified file markersFile = featval outputFile = ( corpus + "Results." + os.path.splitext(os.path.split(markersFile)[1])[0] + ".csv" ) # using the marker filename in the results filename print("Will load markers from", markersFile) marker_list = alignment.readMarkers(markersFile, dialect="excel-tab") elif tag == "-S": # -S: Supreme Court analysis