def pre_post_cleansing(rownumber, csvfilename):
    # Show the effect of cleaning on a single row's problem text.
    with open(csvfilename, encoding="utf-8") as f:
        rows = list(csv.DictReader(f))
    before = rows[rownumber]["srvc_req_prob_text"]
    after = wordslist2string(cleanStringAndLemmatize(before))
    print(before, " becomes ", after)
    return before, after
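# Usage sketch (file name hypothetical): compare row 0's raw and cleaned text.
#
#     before, after = pre_post_cleansing(0, "service_requests.csv")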
def onecorpus(csvfilename, num):
    # Build a single corpus: one text file per service-request closer.
    preparecwd(num)
    if not os.path.exists(str(num)):
        os.makedirs(str(num))
    with open(csvfilename, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["sr_closer_name"]
            docname = name.replace(",", "_").replace(" ", "")
            filepath = str(num) + "/" + docname + ".txt"
            with open(filepath, "a", encoding="utf-8") as file:
                # write processed string, not raw string from csv
                newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
                if len(newline) > 2:
                    newline = newline + " \n "
                    file.write(newline)
def twocorpora(csvfilename, num, percentintraining):
    # Build train/test corpora, assigning each row to a side at random.
    preparecwd(num)
    if not os.path.exists(str(num) + "/TrainCorpus"):
        os.makedirs(str(num) + "/TrainCorpus")
    if not os.path.exists(str(num) + "/TestCorpus"):
        os.makedirs(str(num) + "/TestCorpus")
    with open(csvfilename, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["sr_closer_name"]
            docname = name.replace(",", "_").replace(" ", "")
            if random.randrange(0, 100) / 100 < percentintraining:
                filepath = str(num) + "/TrainCorpus" + "/" + docname + ".txt"
            else:
                filepath = str(num) + "/TestCorpus" + "/" + docname + ".txt"
            with open(filepath, "a", encoding="utf-8") as file:
                # write processed string, not raw string from csv
                newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
                if len(newline) > 2:
                    newline = newline + " \n "
                    file.write(newline)
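# A minimal usage sketch, assuming a csv named "service_requests.csv"
# (hypothetical) with the "sr_closer_name" and "srvc_req_prob_text" columns
# these functions expect:
#
#     onecorpus("service_requests.csv", 1)          # one corpus under ./1/
#     twocorpora("service_requests.csv", 2, 0.75)   # ~75% train / ~25% test under ./2/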
def match():
    # Flask-style handler: clean the incoming problem description,
    # query the BM25 model, and return the matching TSEs as JSON.
    try:
        print(request)
        myjson = request.get_json(force=True)
        print("this is ", myjson)
    except Exception as inst:
        print(type(inst), inst.args, inst, "getTSEs: failed to parse request JSON")
        return errorResponseVanilla("getTSEs " + str(type(inst)), status=500)
    try:
        print(myjson["problem_description"])
        probdesc = wordslist2string(cleanStringAndLemmatize(myjson["problem_description"]))
        predictions = temp.queryalgorithm(probdesc)
        predictions = [changeback(prediction) for prediction in predictions]
    except Exception as inst:
        return errorResponseVanilla("getTSEs " + str(type(inst)), status=500)
    data = {"error": "", "TSEs": predictions}
    print(data)
    js = json.dumps(data)
    return jsonVanilla(js, status=200)
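# The handler above reads like a Flask view (request.get_json), though the app
# object is defined elsewhere. A hypothetical registration and client call:
#
#     app.add_url_rule("/match", "match", match, methods=["POST"])
#     # curl -X POST -H "Content-Type: application/json" \
#     #      -d '{"problem_description": "server keeps rebooting"}' http://localhost:5000/match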
def __iter__(self):
    # Stream (document name, cleaned text) pairs from the CSV reader.
    for row in self.reader:
        name = row["sr_closer_name"]
        docname = name.replace(",", "_").replace(" ", "")
        newline = wordslist2string(cleanStringAndLemmatize(str(row["srvc_req_prob_text"])))
        yield docname, newline
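# __iter__ above belongs to a corpus-streaming class that is not shown here.
# A minimal sketch of one plausible wrapper (the class name and constructor are
# assumptions; only self.reader matters to the generator above):
#
#     class CsvCorpus:
#         def __init__(self, csvfilename):
#             self.csvfile = open(csvfilename, encoding="utf-8")
#             self.reader = csv.DictReader(self.csvfile)
#
#     for docname, text in CsvCorpus("service_requests.csv"):
#         ...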
# __author__ = 'basil.beirouti'

import csv, datetime, random

from BM25 import last_thousand
from BM25.Scheduling import whos_on, read_filtered_csv_file, read_raw_schedule_csv, write_filtered_csv_file, this_year, docmatrix_data
from BM25.Plugins import tuples_tse_psums_concat
from BM25.TextCleaning import wordslist2string, cleanStringAndLemmatize
from BM25.BM25Okapi import QueryMaster, DocMatrix

def rand_divide(data, proportion):
    # Shuffle the data in place and split it into two groups of sizes
    # proportion and 1 - proportion.
    lendata = len(data)
    numgroup1 = round(proportion * lendata)
    numgroup2 = lendata - numgroup1
    random.shuffle(data)
    group1 = data[:numgroup1]
    group2 = data[numgroup1:]
    assert numgroup1 == len(group1)
    assert numgroup2 == len(group2)
    return group1, group2

rows = read_filtered_csv_file()
on_now = whos_on(rows)
personids = [el[1] for el in on_now]
out = docmatrix_data(personids, 500)
srnums, badgenums, personids, fns, lns, psums, dates = zip(*out)
# (first name + last name, problem summary) pairs
tupsdata = [(el[3] + el[4], el[5]) for el in out]
cleaned_data = [(el[0], wordslist2string(cleanStringAndLemmatize(el[1]))) for el in tupsdata]
train, test = rand_divide(cleaned_data, 0.75)
processed_data = tuples_tse_psums_concat(train)
docmatrix = DocMatrix(processed_data)
def clean_docmatrix_data(raw_data):
    # Build (lastname_firstname, cleaned problem summary) pairs, sorted by name.
    temp = [(el[3].replace(" ", "") + "_" + el[2].replace(" ", ""),
             wordslist2string(cleanStringAndLemmatize(el[4])))
            for el in raw_data]
    temp.sort(key=operator.itemgetter(0))
    return temp
def csv_to_tups(csvfilename):
    # Read the CSV into (closer name, cleaned problem text) tuples, sorted by name.
    with open(csvfilename, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        alldata = [(changename(row["sr_closer_name"]),
                    wordslist2string(cleanStringAndLemmatize(row["srvc_req_prob_text"])))
                   for row in reader]
    alldata = sorted(alldata, key=operator.itemgetter(0))
    return alldata
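# A usage sketch mirroring the pipeline in the script above (file name
# hypothetical; tuples_tse_psums_concat and DocMatrix come from BM25.Plugins
# and BM25.BM25Okapi):
#
#     tups = csv_to_tups("service_requests.csv")
#     docmatrix = DocMatrix(tuples_tse_psums_concat(tups))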