Example #1
0
def readCSV(inputFile, users, numOfMarkers):
    """Read tweet/reply pairs from a tab-separated file and pick the markers.

    The first line of ``inputFile`` is treated as a header and discarded.
    Every remaining line is passed to ``processTweetCSVRow``; lines for which
    it returns ``None`` are skipped.

    Args:
        inputFile: Path of the tab-separated ("excel-tab" dialect) input file.
            Undecodable bytes are dropped (``errors="ignore"``).
        users: Not used by this function; kept so existing callers still work.
        numOfMarkers: Upper bound of the slice taken from the frequency-ranked
            word list when markers are derived from the data.

    Returns:
        A dict with two keys:
            "rows": the processed rows, each extended with a boolean
                "reciprocity" entry.
            "markers": when the module-level ``markersFromData`` is truthy, a
                list of ``{"marker": word, "category": word}`` dicts for the
                most frequent tokens; otherwise whatever
                ``alignment.readMarkers(markersFile, "excel-tab")`` returns.
    """
    utterances = []
    freqs = {}
    # Only membership is ever tested, so a set replaces the original dict
    # whose values were always False and never read.
    seenConvIds = set()

    # The context manager guarantees the file handle is released, even if
    # row processing raises. newline="" lets the csv module do its own
    # newline handling, as the csv documentation requires.
    with open(inputFile, errors="ignore", newline="") as csvFile:
        reader = csv.reader(csvFile, dialect="excel-tab")
        next(reader, None)  # discard the header line (no-op on an empty file)

        for rawRow in reader:
            row = processTweetCSVRow(rawRow)
            if row is None:  # invalid tweet pair (e.g. no text in a tweet)
                continue
            utterances.append(row)

            # Word frequencies are only needed when markers come from the data.
            if markersFromData:
                for word in row["msgTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1
                for word in row["replyTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1

            seenConvIds.add(row["convId"])

    # A row is reciprocal when the reversed (replier, sender) pair was also
    # seen as a conversation id.
    # NOTE(review): this can only be True if row["convId"] is a
    # (msgUserId, replyUserId) tuple - confirm against processTweetCSVRow.
    for utterance in utterances:
        reversedPair = (utterance["replyUserId"], utterance["msgUserId"])
        utterance["reciprocity"] = reversedPair in seenConvIds

    if markersFromData:
        # Most frequent words first; sorted() is stable, so ties keep the
        # order in which the words were first seen.
        rankedWords = sorted(freqs, key=freqs.get, reverse=True)
        # NOTE(review): the original comment says up to two types
        # ([mention] & [url]) will be removed; no removal happens here.
        markers = [
            {"marker": word, "category": word}
            for word in rankedWords[0:numOfMarkers]
        ]
    else:
        markers = alignment.readMarkers(markersFile, "excel-tab")

    return {"rows": utterances, "markers": markers}
Example #2
0
def readCSV(inputFile, users, numOfMarkers):
    """Read tweet/reply pairs from a tab-separated file and pick the markers.

    The first line of ``inputFile`` is treated as a header and discarded.
    Every remaining line is passed to ``processTweetCSVRow``; lines for which
    it returns ``None`` are skipped.

    Args:
        inputFile: Path of the tab-separated ("excel-tab" dialect) input file.
            Undecodable bytes are dropped (``errors="ignore"``).
        users: Not used by this function; kept so existing callers still work.
        numOfMarkers: Upper bound of the slice taken from the frequency-ranked
            word list when markers are derived from the data.

    Returns:
        A dict with two keys:
            "rows": the processed rows, each extended with a boolean
                "reciprocity" entry.
            "markers": when the module-level ``markersFromData`` is truthy, a
                list of ``{"marker": word, "category": word}`` dicts for the
                most frequent tokens; otherwise whatever
                ``alignment.readMarkers(markersFile, "excel-tab")`` returns.
    """
    utterances = []
    freqs = {}
    # Only membership is ever tested, so a set replaces the original dict
    # whose values were always False and never read.
    seenConvIds = set()

    # The context manager guarantees the file handle is released, even if
    # row processing raises. newline="" lets the csv module do its own
    # newline handling, as the csv documentation requires.
    with open(inputFile, errors="ignore", newline="") as csvFile:
        reader = csv.reader(csvFile, dialect="excel-tab")
        next(reader, None)  # discard the header line (no-op on an empty file)

        for rawRow in reader:
            row = processTweetCSVRow(rawRow)
            if row is None:  # invalid tweet pair (e.g. no text in a tweet)
                continue
            utterances.append(row)

            # Word frequencies are only needed when markers come from the data.
            if markersFromData:
                for word in row["msgTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1
                for word in row["replyTokens"]:
                    freqs[word] = freqs.get(word, 0) + 1

            seenConvIds.add(row["convId"])

    # A row is reciprocal when the reversed (replier, sender) pair was also
    # seen as a conversation id.
    # NOTE(review): this can only be True if row["convId"] is a
    # (msgUserId, replyUserId) tuple - confirm against processTweetCSVRow.
    for utterance in utterances:
        reversedPair = (utterance["replyUserId"], utterance["msgUserId"])
        utterance["reciprocity"] = reversedPair in seenConvIds

    if markersFromData:
        # Most frequent words first; sorted() is stable, so ties keep the
        # order in which the words were first seen.
        rankedWords = sorted(freqs, key=freqs.get, reverse=True)
        # NOTE(review): the original comment says up to two types
        # ([mention] & [url]) will be removed; no removal happens here.
        markers = [
            {"marker": word, "category": word}
            for word in rankedWords[0:numOfMarkers]
        ]
    else:
        markers = alignment.readMarkers(markersFile, "excel-tab")

    return {"rows": utterances, "markers": markers}
 #	marker_list = alignment.readMarkers(markersFile)
 elif tag == '-o':  #-o: output file
     outputFile = featval
 elif tag == '-c':  #-c: match messages on categories rather than individual words
     useCategories = True
 elif tag == '-r':  #-r: match words defined by regular expressions (will automatically use categories)
     useREs = True
     useCategories = True
 elif tag == '-m':  #-m: use marker list from the specified file
     markersFile = featval
     #outputFile = corpus+'Results.'+os.path.splitext(os.path.split(markersFile)[1])[0]+'.csv'	#using the marker filename in the results filename
     outputFile = 'CHILDES_liwc/ENG-NA-MOR/' + corpus + '/Results.' + os.path.splitext(
         os.path.split(markersFile)[1]
     )[0] + '.csv'  #using the marker filename in the results filen
     print('Will load markers from', markersFile)
     marker_list = alignment.readMarkers(markersFile,
                                         dialect="excel-tab")
 elif tag == '-S':  #-S: Supreme Court analysis
     corpus = 'SCOTUS'
     outputFile = 'results/SCOTUS/SCOTUSResults.csv'
     markersFile = 'wordlists/liwc2007_converted.tsv'
     corpus_dir = 'data/SCOTUS/'
     Subdirs = True
     corpus_name = corpus
     marker_list = alignment.readMarkers(markersFile,
                                         dialect="excel-tab")
 elif tag == '-R':  #-R: extract dialogue participants' roles
     extractRoles = True
     writeRoleHeader = True
     roleOutputFile = 'results/SCOTUS/SCOTUSRoles.csv'
     wrf = open(roleOutputFile, 'w')
 else:
Example #4
0
# Defaults for the command-line switches; the argument loop below overrides them.
useREs = False  # set to True by the -r tag
useCategories = False  # set to True by the -c tag, and also by -r
extractRoles = False  # not changed in the visible part of the argument loop

if len(sys.argv) > 1:
    for arg in sys.argv[1:]:
        tag = arg[0:2]
        featval = arg[3:]
        # VERY IMPORTANT: if you use -C or -S tag, make sure it comes first!! Other tags will be overwritten by it at the moment.
        if tag == "-C":  # -C: corpus name
            corpus = featval
            outputFile = corpus + "Results300.csv"
            markersFile = "wordlists/" + corpus + "Marker300.csv"
            corpus_dir = "data/" + corpus + "/"
            corpus_name = corpus
            marker_list = alignment.readMarkers(markersFile)
        elif tag == "-o":  # -o: output file
            outputFile = featval
        elif tag == "-c":  # -c: match messages on categories rather than individual words
            useCategories = True
        elif tag == "-r":  # -r: match words defined by regular expressions (will automatically use categories)
            useREs = True
            useCategories = True
        elif tag == "-m":  # -m: use marker list from the specified file
            markersFile = featval
            outputFile = (
                corpus + "Results." + os.path.splitext(os.path.split(markersFile)[1])[0] + ".csv"
            )  # using the marker filename in the results filename
            print("Will load markers from", markersFile)
            marker_list = alignment.readMarkers(markersFile, dialect="excel-tab")
        elif tag == "-S":  # -S: Supreme Court analysis