def test_runs(self):
    """Verify that the multi-process and single-process runners agree with
    the expected concepts listed in ./test.list (column 0 is the sentence,
    the remaining non-empty columns are the expected concept names)."""
    import csv
    from findCon import run_singlep, run_multip

    # Load the concept inventory from a file.
    with open('./concepts.list', 'r') as csvfile:
        conlist = [Concept(row[0]) for row in csv.reader(csvfile)]

    # Each row of the test file is one case: sentence + expected concepts.
    with open('./test.list', 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            specs = {'sen': Sentence(row[0]), 'conlist': conlist}
            multi = run_multip(specs)
            single = run_singlep(specs)
            expected = [cell for cell in row[1:] if cell != '']
            # Order is irrelevant; compare as sets.
            self.assertTrue(set(multi) == set(single) == set(expected))
def texts_to_sentences(id, texts):
    """
    Converts texts into sentences.

    :param id: unused here; kept for interface compatibility
    :type texts: list of texts (objects exposing ``run_offset``,
        ``content`` and ``processed_content()``) — assumed; confirm
        against callers
    :return: list of sentences
    """
    # A match of "close marker ... open marker" means the accumulated
    # content spans more than one bold/italics run, so it may legitimately
    # end a sentence even though it starts with a formatting marker.
    markup_boundary = re.compile(r'(BOLD_END.*<BOLD>)|(ITALICS_END.*<ITALICS>)')

    sentences = []
    sentence = Sentence()
    distance_from_run_offset = 0
    content = ''  # initialised here so the post-loop check is safe for empty `texts`
    for text in texts:
        content = ''
        beginningOfSentenceOffset = text.run_offset
        chars = text.processed_content()
        for char in chars:
            content = content + char
            distance_from_run_offset += 1
            # If content is only bolded or italicized text, it should not be
            # its own sentence.  (The original deep-copied `content` first;
            # strings are immutable, so searching it directly is equivalent.)
            result = markup_boundary.search(content)
            final = result is not None and len(result.group()) > 14
            # Sentence break: a period arrives and the content is not just a
            # bold/italics prefix (unless `final` says it spans runs), OR two
            # periods have accumulated.  Parentheses make the original
            # and/or precedence explicit.
            if ('.' in char
                    and (('<BOLD>' not in content[:7] or final)
                         and ('<ITALICS>' not in content[:10] or final))) \
                    or content.count('.') == 2:
                sentence.add_text(
                    Text(text.run_offset, text.content,
                         beginningOfSentenceOffset - text.run_offset,
                         len(content)))
                sentences.append(sentence)
                sentence = Sentence()
                content = ''
                beginningOfSentenceOffset = text.run_offset + distance_from_run_offset
        # Whatever remains of this text stays in the still-open sentence.
        sentence.add_text(
            Text(text.run_offset, text.content,
                 beginningOfSentenceOffset - text.run_offset,
                 len(content)))
        distance_from_run_offset = 0
    if content != '':
        sentences.append(sentence)
    return sentences
# Drop rows missing either the sentence text or the business id.
# final_df = final_df[~final_df.business_id.isnull()]
final_df = final_df.dropna(axis=0, subset=['sentence', 'business_id'])  ## MY ADDITION HOPING IT DOESN'T MESS THINGS UP
final_df.to_pickle('final_df.pkl')  ## REMOVE

# FEATURIZE
## Import Sentence class from this project
# import sys
# sys.path.append("/var/www/sandbox/ben/opinion-mining")
from classes.sentence import Sentence

# Single-arg parenthesised print: identical output under Python 2 and 3.
print("Featurizing the training data frame (may take a little while)")

sents = [Sentence(sent) for sent in final_df.sentence]
for sent, stars in zip(sents, final_df.review_stars):
    sent.stars = stars  # pass the number of stars in

featurized_df = pd.DataFrame([sent.get_features() for sent in sents])
featurized_df['sentiment'] = final_df.sentiment
featurized_df = featurized_df[~featurized_df.sentiment.isnull()]
print("Done.")

# ipdb.set_trace()

# Adjust sentiment labels.  Use .loc rather than chained indexing
# (df.sentiment[mask] = v) — the chained form assigns through a possible
# copy and is the documented SettingWithCopy hazard in pandas.
featurized_df.loc[featurized_df.sentiment == 'Positive', 'sentiment'] = 1
featurized_df.loc[featurized_df.sentiment == 'Negative', 'sentiment'] = -1
from classes.sentence import Sentence
from classes.tag import Tag

# Three sample tags over disjoint character ranges of the sentence.
tag_specs = [(1, 6, "mylabel"), (10, 16, "something"), (25, 30, "whatever")]
tags = [Tag(start, end, label) for start, end, label in tag_specs]

sentence = Sentence("black", tags)
print(sentence.sentence)
print(sentence.tags[0].start)
description='Script to convert a list of strings to concepts')
#epochs, batch_size and model ID
parser.add_argument('--sen',
                    type=str,
                    default='Which restaurants do West Indian ?food',
                    help='sentence string')
parser.add_argument(
    '--fpath',
    type=str,
    default='./concepts.list',
    help='location of file containing the list of concepts')
args = parser.parse_args()  #arguments from the parser
sen = Sentence(args.sen)
conlist_fpath = args.fpath
#read the list of concepts from the specified file
# Each row's first column is taken as one concept name; extra columns
# (if any) are ignored here.
conlist = []
with open(conlist_fpath, 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        conlist.append(Concept(row[0]))
'''
the preprocessing innvolves on-the-fly generation of table for each concept
in the csv. This can be either be done apriori for the whole databse
or can be done on-the-fly got batches of data for data_parallilsm case
'''
def preprocessText(text_file):
    """
    Parse an SGML-style news file into document metadata and sentences.

    Extracts the body between the <TEXT> and </TEXT> markers, then applies
    per-source heuristics — selected by substrings of the *filename*
    (wsj, ABC, ea/ed, PRI, CNN, VOA, XIE, AP9, APW/NYT) — to recover the
    document number, subject/headline, journal name, additional header
    information, and a normalised "YYYY-MM-DDTHH:MM:SS" date string.

    :param text_file: path to the source file; the filename encodes the
        news source and is matched by substring below.
    :return: tuple (Document, list of Sentence objects)
    """
    number = 0
    currentIndex = 0  # Could be 6 to keep the current numbers of the original annotations
    sentences = []
    subject = ""
    additionalInformation = ""
    # NOTE(review): file handle is never closed — consider a `with` block.
    file = open(text_file, 'r')
    text = file.read()
    separateLines = text.splitlines()
    # Body text lives between the <TEXT> ... </TEXT> marker lines.
    beginIndex = separateLines.index("<TEXT>") + 1
    endIndex = separateLines.index("</TEXT>")
    # For wsj files
    if "wsj" in text_file:
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                # Slice strips the surrounding <DOCNO>...</DOCNO> tags.
                documentNo = separateLines[s][8:-9]
            if "<HL>" in separateLines[s]:
                beginSubject = s
            if "</HL>" in separateLines[s]:
                endSubject = s
        # Getting document data
        date = separateLines[endSubject + 1]
        journal = separateLines[endSubject + 2]
        for part in range(0, beginIndex - 1):
            if beginSubject <= part <= endSubject - 1:
                subject += separateLines[part] + str(" ")
            elif part == endSubject:
                subject += separateLines[part]
            if part > endSubject + 2:
                if "<DATELINE>" in separateLines[part]:
                    additionalInformation += separateLines[part][10:-11]
                else:
                    additionalInformation += separateLines[part]
        # Strip the <HL> / </HL> markup accumulated at the ends.
        subject = subject[5:-6]
        dateParts = date.split("/")
        # Two-digit year heuristic: > 20 means 19xx, otherwise 20xx.
        if (int(dateParts[2])) > 20:
            newDate = "19" + dateParts[2][:2]
        else:
            newDate = "20" + dateParts[2][:2]
        newDate = newDate + "-" + dateParts[0][1:] + "-" + dateParts[
            1] + "T00:00:00"
    # FOR ABC files
    if "ABC" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast news"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        # Date is embedded in the document number (YYYYMMDD after a prefix).
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"
    if "ea" in text_file or "ed" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = "19" + documentNo[2:4] + "-" + documentNo[
            4:6] + "-" + documentNo[6:8] + "T00:00:00"
    if "PRI" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"
    if "CNN" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "CNN"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"
    if "VOA" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "VOA"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"
    if "XIE" in text_file:
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][9:-9]
            if "DATE_TIME" in separateLines[s]:
                datefile = separateLines[s][12:-13]
            if "<HEADLINE>" in separateLines[s]:
                start_subject = s + 1
            if "</HEADLINE>" in separateLines[s]:
                end_subject = s
        # Headline lines are appended after the "NEWS STORY" prefix.
        for line in range(start_subject, end_subject):
            subject += separateLines[line] + " "
        newDate = datefile[1:] + "T00:00:00"
    # FOR AP files
    if "AP9" in text_file:
        subject = ""
        journal = "Associated Press Writer"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
            if "<SECOND>" in separateLines[s]:
                start_subject = s + 1
            if "LaserPhotos" in separateLines[s]:
                end_subject = s
        for line in range(start_subject, end_subject):
            subject += separateLines[line]
        newDate = "19" + documentNo[2:4] + "-" + documentNo[
            4:6] + "-" + documentNo[6:8] + "T00:00:00"
    if "APW" in text_file or "NYT" in text_file:
        subject = ""
        journal = "Associated Press Writer"
        if "NYT" in text_file:
            journal = "NYT"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
            if "DATE_TIME" in separateLines[s]:
                datefile = separateLines[s][12:-13]
            if "<HEADLINE>" in separateLines[s]:
                start_subject = s + 1
            if "</HEADLINE>" in separateLines[s]:
                end_subject = s
        for line in range(start_subject, end_subject):
            subject += separateLines[line] + " "
        # Everything between the headline and <TEXT> is extra header info.
        for add in range(end_subject + 1, beginIndex - 1):
            additionalInformation += separateLines[add] + " "
        # DATE_TIME field is "MM/DD/YYYY hh:mm:ss"-shaped; reorder to ISO.
        newDate = datefile[6:10] + "-" + datefile[:2] + "-" + datefile[
            3:5] + "T" + datefile[11:19]
    # Generate sentence objects
    for sentence in range(beginIndex, endIndex):
        if (len(separateLines[sentence]) == 0):
            currentIndex += 1  # empty line as extra index
            continue
        if "---" in separateLines[sentence]:
            # Separator rule; skip it but advance the character index.
            currentIndex += 4
            continue
        sentences.append(
            Sentence(currentIndex, separateLines[sentence], number))
        currentIndex = sentences[-1].endIndex + 1
        number += 1
    document = Document(subject, journal, documentNo, additionalInformation,
                        newDate)
    return (document, sentences)