def get_date(pdfdata):
    text = parse_layout(pdfdata)
    dates = []
    for line in text.splitlines():
        try:
            extracted = list(datefinder.find_dates(line))
        except Exception:  # datefinder can raise on pathological input
            continue
        if extracted:
            dates.extend(extracted)
    return dates
def get_est_date(self, start_d, end_d=None, n_first=10, n_last=10, strictness=True):
    # NB: a datetime.today() default argument would be evaluated once at import
    # time; resolving it at call time keeps the upper bound current
    if end_d is None:
        end_d = datetime.today()
    # tokenize text by sentence
    sentences = sent_tokenize(self)
    # check to see if there's a date in the first n sentences
    selection = ' '.join(sentences[0:n_first])
    dates = datefinder.find_dates(selection, strict=strictness)
    # convert the datefinder generator to a list of normalized, naive dates
    dates = [d.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
             for d in dates]
    # verify dates are legitimate
    legit_dates = [d for d in dates if check_date(d, start_d, end_d)]
    # if any legitimate date exists in the first n sentences, return the first one
    if legit_dates:
        return legit_dates[0]
    # if not, repeat the process with the last n sentences in the document
    selection = ' '.join(sentences[-(n_last + 1):-1])
    dates = datefinder.find_dates(selection, strict=strictness)
    dates = [d.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
             for d in dates]
    legit_dates = [d for d in dates if check_date(d, start_d, end_d)]
    # if any legitimate date exists in the last n sentences, return the first one
    if legit_dates:
        return legit_dates[0]
    return None
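# get_est_date relies on a check_date helper that is not shown in this snippet.
# A minimal sketch of what it presumably does, judging from how it is called
# (the semantics are an assumption, not taken from the source):
def check_date(d, start_d, end_d):
    # a candidate date is legitimate when it falls inside the plausible window
    return start_d <= d <= end_d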
def test_find_date_strings(input_text, expected_date):
    if isinstance(expected_date, list):
        matches = list(datefinder.find_dates(input_text))
        assert matches == expected_date
    else:
        return_date = None
        for return_date in datefinder.find_dates(input_text):
            assert return_date == expected_date
        assert return_date is not None  # handles dates that were never matched
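# Tests like the one above are normally driven by pytest parametrization.
# A self-contained sketch with illustrative cases (not the project's real
# fixture list); the empty-list case exercises the isinstance(list) branch:
import datetime
import pytest
import datefinder

@pytest.mark.parametrize("input_text, expected", [
    ("released 10 January 2008", [datetime.datetime(2008, 1, 10, 0, 0)]),
    ("hello world", []),  # no date expected here
])
def test_find_dates_examples(input_text, expected):
    assert list(datefinder.find_dates(input_text)) == expected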
def getvec(lines):
    '''
    Calculate the feature vector for a single paragraph.

    Features:
        0: number of street terms (INT)
        1: number of state terms (INT)
        2: number of city terms (INT)
        3: number of country terms (INT)
        4: sum of the weights of the street terms (FLOAT)
        5: has phone number? (count)
        6: has zip codes / numbers? (count)
        7: length of paragraph in tokens (INT)
        8: has date? 0/1

    Parameters
    ----------
    lines : list of str
        The lines of a single paragraph from a synthetic blog.

    Returns
    -------
    vec : list of length 9
    '''
    vec = [0] * 9
    for line in lines:
        phnum = len(rephone.findall(line))
        nums = len(renum.findall(line))
        numterm = 0
        for terms in st.tokenize(line):
            numterm += 1
            if terms.lower() in streets:
                vec[0] += 1
                vec[4] += streets[terms.lower()] / float(summ)
            if terms in states:
                # state names are biased towards US and Australian addresses,
                # therefore we don't add their weights
                vec[1] += 1
            if terms in cities:
                vec[2] += 1
            if terms in countries:
                vec[3] += 1
        vec[5] = phnum
        vec[6] = nums
        vec[7] = numterm
        matches = datefinder.find_dates(line, strict=True)
        try:
            for match in matches:
                vec[8] = 1
                break
        except Exception:  # datefinder can raise while consuming the generator
            pass
    return vec
def date_extract(input_string):
    """Extract the first date found in the given input string."""
    matches = list(datefinder.find_dates(input_string))
    # NB: raises IndexError when no date is found
    date = {}
    date["year"] = matches[0].year
    date["month"] = matches[0].month
    date["day"] = matches[0].day
    date["hour"] = matches[0].hour
    date["minutes"] = matches[0].minute
    date["second"] = matches[0].second
    return date
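# A hedged usage sketch: date_extract indexes matches[0], so it raises
# IndexError on text with no recognizable date.
try:
    print(date_extract("shipped on 2019-08-02 14:05:00"))
    # -> {'year': 2019, 'month': 8, 'day': 2, 'hour': 14, 'minutes': 5, 'second': 0}
except IndexError:
    print("no date found in input")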
def check_if_date_present(text):
    '''
    Check if a date/schedule is present in the text.

    :param text: text to scan
    :return: True if a date is present, else False
    '''
    timex_tag = tag(text)
    if re.search('<TIMEX2>', timex_tag):
        return True
    matches = datefinder.find_dates(text)
    for match in matches:
        if match:
            return True
    return False
def check(url):
    # urllib.urlopen was Python 2; urllib.request.urlopen is the Python 3 equivalent
    response = urllib.request.urlopen(url)
    doc = response.read()
    dates_list = datefinder.find_dates(doc)
    days_ago = [(datetime.datetime.today() - date).days for date in dates_list]
    if not days_ago:
        return {'status': 'No Dates', 'code': -1, 'last_entity': 0}
    last_date_validated = min(i for i in days_ago if i > 0)
    if last_date_validated > 100:
        return {'status': 'Not Maintained', 'code': 0,
                'last_entity': last_date_validated}
    return {'status': 'Maintained', 'code': 1,
            'last_entity': last_date_validated}
def __get_events(logs_with_id):
    log_index, line = logs_with_id
    line = line.lower()

    # GET month names that appear as part of a date
    matches = datefinder.find_dates(line, source=True)
    months = []
    for match in matches:
        month = sub('[^a-zA-Z]', '', match[1])
        if month:
            months.append(month)

    # only keep alphabetic characters, maintaining the word split
    line_split = []
    for li in line.split():
        alphabet_only = sub('[^a-zA-Z]', '', li)
        line_split.append(alphabet_only)

    # GET preprocessed_event_countgroup: collapse runs of spaces
    line = ' '.join(line_split)
    line = ' '.join(line.split())
    preprocessed_event_countgroup = line

    # GET preprocessed_events: remove words that are only one character long
    for index, word in enumerate(line_split):
        if len(word) == 1:
            line_split[index] = ''
    line = ' '.join(line_split)
    line = ' '.join(line.split())

    # remove stopwords, plus any month names found above;
    # copy the list so the base stopword list is not mutated
    stopwords = corpus.stopwords.words('english')
    stopwords_month = list(stopwords)
    if months:
        stopwords_month.extend(months)
    stopwords_result = [word for word in line.split()
                        if word not in stopwords_month]
    preprocessed_events = ' '.join(stopwords_result)
    preprocessed_events_graphedge = preprocessed_events

    preprocessed_with_id = (log_index, preprocessed_events,
                            preprocessed_event_countgroup,
                            preprocessed_events_graphedge)
    return preprocessed_with_id
def hasdate(address):
    '''
    Check whether an address contains a date. Dates were still coming
    through in the addresses, so they are manually filtered out here.

    Parameters
    ----------
    address : list of str
        The lines making up one address.

    Returns
    -------
    bool
        True if a date is found in the address, else False.
    '''
    str1 = " ".join(address)
    matches = datefinder.find_dates(str1, strict=True)
    for match in matches:
        return True
    return False
def test_find_date_strings(input_text, expected_date):
    for return_date in datefinder.find_dates(input_text):
        assert return_date == expected_date
def find_date(self, row):
    # parse once instead of running datefinder twice over the same row
    matches = list(datefinder.find_dates(row, strict=True))
    if matches:
        # NB: '%-m'/'%-d' are glibc-only strftime extensions (not on Windows)
        return matches[0].strftime('%-m/%-d/%Y')
    return None
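# '%-m' and '%-d' raise ValueError on Windows. A portable sketch producing
# the same M/D/YYYY form from the datetime's attributes:
import datefinder

d = list(datefinder.find_dates("due 03/07/2021", strict=True))[0]
print(f"{d.month}/{d.day}/{d.year}")  # 3/7/2021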
import datefinder

matches = datefinder.find_dates(
    "obama was born on January 4th, 2017 at 8:00. He got married on 12 june 2008")
for match in matches:
    print(match)
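# A minimal sketch of the same call with source=True, which also yields the
# matched substring and helps debug unexpected parses (output shape may vary
# slightly across datefinder versions):
for found, source_text in datefinder.find_dates(
        "obama was born on January 4th, 2017 at 8:00.", source=True):
    print(found, repr(source_text))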
def write_state(self, file, course):
    course_info = self.check_course(course).get()
    if not course_info:
        raise MyException('No course found with this number.')
    text_error = ""
    wb = openpyxl.load_workbook(io.BytesIO(file))
    # wb.get_sheet_names()/get_sheet_by_name() are deprecated in openpyxl;
    # wb.sheetnames and wb[name] are the current equivalents
    groups = {format_group(i): i for i in wb.sheetnames}
    self.sql = f"""
        SELECT name FROM "group-cource_rels", groups
        WHERE cource_id = {course} AND group_id=id
    """
    for name in {i for (i,) in self.all()} & set(groups.keys()):
        sheet = wb[groups[name]]
        first_data, last_data = {}, {}
        dates = []
        for row in sheet.values:
            if not dates:
                dates = list(filter(bool, row[3:]))
                continue
            key = tuple(row[:3])
            first_data[key] = tuple(row[3:])
        group_data = Student().create_group(name)
        self.sql = f"""
            SELECT id, concat(last_name, first_name, patronymic) as fio,
                   gradebook_identy
            FROM students WHERE group_id = {group_data[0]}
        """
        students = self.all()
        for stud in first_data.keys():
            check_id = lambda x: x[0] == stud[0]
            check_number = lambda x: x[2] == str(stud[2]).replace(' ', '')
            check_name = lambda x: x[1].lower() == str(stud[1]).replace(
                ' ', '').lower()
            be = None
            if stud[0]:
                be = [i for i in students if check_id(i)]
            elif stud[2]:
                be = [i for i in students if check_number(i)]
            elif stud[1]:
                be = [i for i in students if check_name(i)]
            if be and len(be) == 1:
                new_key = list(stud)
                new_key[0] = be[0][0]
                last_data[tuple(new_key)] = first_data[stud]
            elif not be and stud[1] and stud[2]:
                r = compile(r'^[ \n]*(?P<fio>[\w]+ [\w]+ [\w]+)[ \n]*$')
                fio = r.search(stud[1])
                if fio:
                    fio = fio.group('fio').split(' ')
                    student = {
                        'last_name': fio[0],
                        'first_name': fio[1],
                        'patronymic': fio[2],
                        'id': None
                    }
                    try:
                        gradebook = int(stud[2])
                    except (ValueError, TypeError):
                        text_error += f'The grade book number must be numeric. {stud[2]} \n'
                        continue
                    student.update({
                        'group_id': name,
                        'gradebook': gradebook,
                    })
                    new_key = list(stud)
                    new_key[0] = Student().create(student, True).commit_return()[0]
                    last_data[tuple(new_key)] = first_data[stud]
            else:
                text_error += f"Could not unambiguously identify student {stud}) \n"
        if not last_data:
            continue
        self.sql = f"""
            SELECT id, date_time FROM lessons
            WHERE cource_id = {course} AND group_id = {group_data[0]}
        """
        all_lesson = self.all()
        last_date = {}
        for lesson in dates:
            date_lesson = list(datefinder.find_dates(lesson))
            if not date_lesson:
                last_date[lesson] = None
                continue
            # match the parsed date against known lessons within a 20-minute window
            check = [
                i for i in all_lesson
                if i[1] - timedelta(minutes=20) <= date_lesson[0]
                <= i[1] + timedelta(minutes=20)
            ]
            print(check)
            if check:
                last_date[lesson] = check[0][0]
            else:
                last_date[lesson] = self.create_lesson(
                    course, group_data[0], lesson).commit_return()[0]
            self.sql = f"""
                INSERT INTO student_visits(student_id, lesson_id, visited)
                VALUES {','.join(f'({i[0]}, {last_date[lesson]}, false)' for i in students)}
                ON CONFLICT DO NOTHING
            """
            self.commit()
        for stud, values in last_data.items():
            for index, val in enumerate(values):
                if not val or not last_date[dates[index]]:
                    continue
                if isinstance(val, int):
                    self.sql = f"""
                        INSERT INTO student_performance(student_id, lesson_id, points)
                        VALUES ({stud[0]}, {last_date[dates[index]]}, {val})
                        ON CONFLICT (student_id, lesson_id) DO UPDATE
                        SET points = EXCLUDED.points
                    """
                    self.commit()
                self.sql = f"""
                    INSERT INTO student_visits(student_id, lesson_id, visited)
                    VALUES ({stud[0]}, {last_date[dates[index]]}, {val != '-'})
                    ON CONFLICT (student_id, lesson_id) DO UPDATE
                    SET visited = EXCLUDED.visited
                """
                self.commit()
    return text_error
def get_svo(self, sentence):
    '''get the SVO triples of a single sentence'''
    parsed_phrase = self.nlp(sentence)
    names = list(parsed_phrase.ents)
    corrected_names = []
    persons = []
    locations = []
    organizations = []
    event_date = []
    norp = []
    facilities = []
    events = []
    # NB: spaCy exposes the string label as e.label_; the original mixed in
    # e.label (an integer id), so those comparisons could never match
    date_words = ['January', 'February', 'March', 'April', 'May', 'June',
                  'July', 'August', 'September', 'October', 'November',
                  'December', 'Sunday', 'Monday', 'Tuesday', 'Wednesday',
                  'Thursday', 'Friday', 'Saturday', 'week', 'Yesterday',
                  'month', 'day', 'Today']
    for e in names:
        linked = e.text
        if any(map(str.isupper, linked)) and not any(
                ext in linked for ext in date_words):
            corrected_names.append(linked)
        if e.label_ == 'GPE' or e.label_ == 'LOC':
            locations.append(e.text)
        if e.label_ == 'PERSON':
            persons.append(e.text)
        if e.label_ == 'ORG':
            organizations.append(e.text)
        if e.label_ == 'NORP':
            norp.append(e.text)
        if e.label_ == 'FACILITY' or e.label_ == 'PRODUCT':
            facilities.append(e.text)
        if e.label_ == 'EVENT':
            events.append(e.text)
    subjects = []
    objects = []
    verbs = []
    for text in parsed_phrase:
        if text.dep_.startswith("nsubj") or text.dep_ in ['conj']:
            subjects.append(text.orth_)
        if text.dep_ in ["dobj", 'pobj']:
            objects.append(text.orth_)
        if text.pos_ == "VERB":
            verbs.append(text.orth_)
    # event date: try explicit weekday/relative words first, then datefinder
    try:
        event_date = list(
            set(sentence.replace('.', '').split()) &
            {'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday', 'Today', 'today', 'Tomorrow', 'tomorrow',
             'Yesterday', 'yesterday'})[0]
    except IndexError:
        try:
            event_date = list(datefinder.find_dates(sentence))[0]
            if str(event_date.year) not in sentence:
                event_date = str(event_date.month) + '/' + str(event_date.day)
            event_date = str(event_date)
        except Exception:
            event_date = None
    return {'Sentence': sentence, 'Subjects': subjects, 'Predicates': verbs,
            'Objects': objects, 'Names': corrected_names,
            'Event_date': event_date, 'Persons': persons,
            'Locations': locations, 'Organizations': organizations,
            'NORP': norp, 'Facilities': facilities, 'Events': events}
def volunteer_slot(service):
    clinician_email = service.calendarList().get(
        calendarId='primary').execute()['id']
    print("Please enter your volunteer slots Date and Start-Time.")
    input_time = input(
        colored("Day Month Time - e.g: 14 Dec 14:30: ", 'yellow'))
    summary = input("Please enter the slot summary: ")
    description = input("Please enter the slot description: ")
    increment_time = 30
    # NB: raises IndexError if no date can be parsed from the input
    start_date_time = list(datefinder.find_dates(input_time))[0]
    start_date_time_str = start_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")
    end_date_time = start_date_time + timedelta(minutes=increment_time)
    end_date_time_str = end_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")
    google_meet_id = uuid4().hex
    for i in tqdm(range(3)):
        event = {
            'summary': summary,
            'description': description,
            'start': {
                'dateTime': start_date_time_str,
                'timeZone': 'Africa/Johannesburg',
            },
            'end': {
                'dateTime': end_date_time_str,
                'timeZone': 'Africa/Johannesburg',
            },
            'attendees': [{
                'email': clinician_email,
                'responseStatus': 'accepted'
            }],
            'reminders': {
                'useDefault': False,
                'overrides': [
                    {'method': 'email', 'minutes': 24 * 60},
                    {'method': 'popup', 'minutes': 10},
                ],
            },
            'conferenceData': {
                "createRequest": {
                    "requestId": google_meet_id,
                    "conferenceSolutionKey": {"type": "hangoutsMeet"}
                }
            },
        }
        event = service.events().insert(calendarId=CLINIC_CALENDAR_ID,
                                        body=event,
                                        conferenceDataVersion=1).execute()
        # print('Event created: {0}'.format(event.get('htmlLink')))
        # advance the half-hour window for the next slot
        start_date_time = start_date_time + timedelta(minutes=increment_time)
        start_date_time_str = start_date_time.strftime(
            "%Y-%m-%dT%H:%M:%S+02:00")
        end_date_time = end_date_time + timedelta(minutes=increment_time)
        end_date_time_str = end_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")
    print(colored('Your volunteer slot for ', 'green') +
          colored(summary, 'yellow') +
          colored(' was successfully created.', 'green'))
def extract_and_write_posts(elements, filename):
    import time as tt
    start = tt.time()
    print(start, len(elements))
    try:
        f = open(filename, "w", newline="\r\n")
        f.writelines("TIME||TYPE||TITLE||STATUS||LINKS" + "\n" + "\n")
        for x in elements:
            try:
                title = " "
                status = " "
                link = ""
                time = " "
                # legacy selector-based title/status/link extraction was
                # disabled here in favor of the aria-label parsing below
                linkdata = x.find_all('a', href=True, role='link')
                timedata = x.find_all('a', role="link", tabindex="0")
                for i in range(len(timedata)):
                    try:
                        tryts = timedata[i]['aria-label']
                        found = list(datefinder.find_dates(tryts))
                        if found:
                            time = found[0].strftime("%m/%d/%Y")
                            break
                    except Exception:
                        pass
                if len(elements) <= 1:
                    time = datetime.now().strftime("%m/%d/%Y")
                for sub in linkdata:
                    try:
                        link = sub['href']
                        if ids in link or 'https://www.facebook.com/' in link:
                            link = ""
                        elif link != '#':
                            break
                    except Exception:
                        pass
                line = (time + " || " + ' ' + " || " + ' ' + " || " + ' '
                        + " || " + str(link) + "\n")
                try:
                    f.writelines(line)
                except Exception:
                    print("Posts: Could not map encoded characters")
            except Exception:
                pass
        f.close()
        print(tt.time() - start)
    except Exception:
        print("Exception (extract_and_write_posts)", "Status =",
              sys.exc_info()[0])
    return
# string_with_dates = '''
# MSDS #: RQ1007792 Issue Date: 09/2010 Supersedes: RQ0900706 Issue Date: 01/2009
# '''
text_file = open(
    "D:/Dropbox/_ICF_project/WA 2-75/HD/Delivery/Extract_summary/date_raw.txt",
    "r")
lines = text_file.readlines()

lines_post = []
for k in lines:
    try:
        matches = datefinder.find_dates(k)
        matches_str = " || ".join([str(z).replace('\n', "") for z in matches])
        lines_post.append(matches_str)
    except Exception:
        lines_post.append("N/A")

with open(
        "D:/Dropbox/_ICF_project/WA 2-75/HD/Delivery/Extract_summary/date_raw2.txt",
        'w') as f:
    for s in lines_post:
        f.write(s + "\n")
def parseCase(self, response):
    try:
        petitioner = response.css('h2 ::text').extract()[2].split(',')[0]
    except Exception:
        petitioner = ""
    try:
        respondent = response.css('h2 ::text').extract()[0].split(',')[0]
    except Exception:
        respondent = ""
    try:
        judgement_text = " ".join(response.css('pre ::text').extract())
    except Exception:
        judgement_text = " ".join(response.css('.col-sm-9 ::text').extract())
    sentences = judgement_text.split('\n')
    source = 'California Court of Appeal'
    matches = list(
        datefinder.find_dates(
            response.css('h3+ .bottom .meta-data-value ::text').extract_first()))
    date = matches[-1].day
    month = matches[-1].month
    year = matches[-1].year

    # scan progressively larger tails of the judgement for outcome keywords
    paras = judgement_text.split('\n\n')
    judgement = 'tied / unclear'
    for tail in (15, 20, 35):
        last_paras = ' '.join(paras[-tail:])
        if ('affirmed' in last_paras or 'denied' in last_paras
                or 'dismissed' in last_paras):
            judgement = 'dismissed'
            break
        if 'reversed' in last_paras:
            judgement = 'allowed'
            break

    bench_sentence = [x for x in sentences if re.search(', J\\.', x)]
    bench_sentence += [x for x in sentences if re.search('P\\. J\\.', x)]
    bench_sentence += [x for x in sentences if re.search('Judge:', x)]
    for token in ("P. J.", ", J.", "Trial", "Acting", "ACTING", "Judge"):
        bench_sentence = [sub.replace(token, '') for sub in bench_sentence]
    bench = [
        sub.translate(str.maketrans('', '', string.punctuation)).strip()
        for sub in bench_sentence
    ]
    bench = list(dict.fromkeys(bench))
    bench = ", ".join(bench)

    appellant_sentence = []
    for pattern in ('for Defendant and Appellant', 'for Defendant/Appellant',
                    'for\nDefendant', 'for Petitioner', 'for Defendant'):
        appellant_sentence += [x for x in sentences if re.search(pattern, x)]
    for token in ("for", "Counsel", "Appellant", "and", "Petitioner",
                  "Defendant", "Respondent", "Appeal",
                  "under appointment by the Court of"):
        appellant_sentence = [sub.replace(token, '')
                              for sub in appellant_sentence]
    petitioner_counsel = [
        sub.translate(str.maketrans('', '', string.punctuation)).strip()
        for sub in appellant_sentence
    ]
    petitioner_counsel = list(dict.fromkeys(petitioner_counsel))

    respondent_sentence = []
    for pattern in ('for Plaintiff and Respondent', 'for Respondent',
                    'for Plaintiff', 'for\nPlaintiff',
                    'for Plaintiff/Respondent'):
        respondent_sentence += [x for x in sentences if re.search(pattern, x)]
    for token in ("for", "Counsel", "Respondent", "and", "Plaintiff",
                  "petitioner", "Defendant", "Appellant",
                  "under appointment by the Court of"):
        respondent_sentence = [sub.replace(token, '')
                               for sub in respondent_sentence]
    respondent_counsel = [
        sub.translate(str.maketrans('', '', string.punctuation)).strip()
        for sub in respondent_sentence
    ]
    respondent_counsel = list(dict.fromkeys(respondent_counsel))

    items['source'] = source
    items['url'] = response.request.url
    items['petitioner'] = petitioner
    items['respondent'] = respondent
    items['date'] = date
    items['month'] = month
    items['year'] = year
    items['bench'] = bench
    items['judgement'] = judgement
    items['judgement_text'] = judgement_text
    items['petitioner_counsel'] = petitioner_counsel
    items['respondent_counsel'] = respondent_counsel
    items['title'] = respondent + ' v. ' + petitioner
    print("...")
    yield items
from sortFile import SortFile

tesseract: Tesseract = Tesseract()
imageToText: ImageToText = TessaractAdapter(tesseract)
allFiles = os.listdir("images")
fileList = []
for file in allFiles:
    try:
        txt = imageToText.toText(file)
        dates = datefinder.find_dates(txt)
        dateSet = set(dates)
        for d in dateSet:
            formatedDate = d.strftime("%Y-%m-%d")
            tempDict = {'fileName': file, 'date': formatedDate}
            fileList.append(tempDict)
    except Exception as e:
        print("error:", e)

sortFile = SortFile()
def date_finder_add(self, text):
    dates = df.find_dates(text)
    list_date = []
    for date in dates:
        list_date.append(datetime.date(date.year, date.month, date.day))
    return list_date
org_report_list = [
    file for file in os.listdir(org_report_folder) if file.endswith('.docx')
]
report_index = []
for r_i, report_name in enumerate(org_report_list):
    f = open(os.path.join(org_report_folder, report_name), 'rb')
    document = Document(f)
    f.close()
    para_first = document.paragraphs[0]
    if len(para_first.text.split()) < 2:
        para_first.text = 'XXXX'
    for p_i, para in enumerate(document.paragraphs):
        matches = list(
            datefinder.find_dates(para.text, source=True, index=True))
        if len(matches) > 0:
            for date_t in matches:
                # date_t is (datetime, source_string, index_span)
                match_date = re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', date_t[1])
                if match_date:
                    document.paragraphs[p_i].text = document.paragraphs[
                        p_i].text.replace(date_t[1], 'XX/XX/XXXX')
                    print(date_t[1] + ' in ' + report_name)
        if 'Dictated By:' in para.text:
            para.text = 'Dictated by: XXXX'
    document.save(os.path.join(output_report_folder, '%05d' % r_i + '.docx'))
    report_index.append('%05d' % r_i + '.docx;' + report_name + '\n')
        for j in range(len(response['relations'][i]['arguments'])):
            argument = response['relations'][i]['arguments'][j]['entities'][0]
            if argument['type'] == "Person":
                string_to_add = argument['text']
                if string_to_add == "j\u2019":
                    string_to_add = tweet._json['entities']['user_mentions'][0]['screen_name']
                person += " " + string_to_add
            elif argument['type'] in ("Location", "GeopoliticalEntity"):
                location = " " + argument['text']
    except Exception:
        continue
    try:
        date = datetime.datetime.min
        date_list = datefinder.find_dates(tweet.text.split('https')[0])
        for match in date_list:
            date = match
            break
    except ValueError:
        date = datetime.datetime.min
    if date > datetime.datetime.now() and int(date.year) < 2021:
        if date != "" or person != "" or location != "":
            hashtags = ""
            for elem in tweet._json['entities']['hashtags']:
                hashtags += " " + elem['text']
            entry = {'tweet': tweet._json, 'date': str(date),
                     'location': location, 'Personnes': person,
                     'Hashtags': hashtags, 'text': tweet.text}
            conferencejson['conference'].append(entry)
            print(entry)
def extractevent():
    event_types = [
        "coffee night", "job fairs", "career fairs", "career fair",
        "tech talk", "alumni connection", "lecture", "Birthday", "Meeting",
        "Seminar", "Party", "Anniversary", "Marriage", "Appointment", "Meet",
        "sports", "career fair", "Workshop"
    ]
    event_dates = [
        "Tomorrow", "Today", "Day After Tomorrow", "Next Month", "Next Week"
    ]
    csvFile = pd.read_csv('CSV_NAME.csv')
    messages = csvFile['Message_body']
    senders = csvFile['Sender']
    subjects = csvFile['Subject']
    empdates = []
    event_list = []
    msg_counter = 0
    # .iteritems() was removed in recent pandas; .items() is the replacement
    for index, row in messages.items():
        i = 0
        event_dict = {}
        str1 = ''.join(row)
        str1 = str1.replace("-", " ").replace("|", " ")
        for event_type in event_types:
            if event_type.lower() in str1.lower():
                flag = False
                for event_date in event_dates:
                    if event_date.lower() in str1.lower():
                        convertedDate = rawday2date.getDate(event_date.lower())
                        json_event_date = convertedDate.date()
                        matches = datefinder.find_dates(str1)
                        for match in matches:
                            if match.time():
                                if i == 0:
                                    json_event_startTime = match.time()
                                elif i == 1:
                                    json_event_endTime = match.time()
                                i += 1
                        flag = True
                        break
                if not flag:
                    matches = datefinder.find_dates(str1)
                    for match in matches:
                        if i == 0:
                            json_event_date = match.date()
                            json_event_startTime = match.time()
                        elif i == 1:
                            json_event_endTime = match.time()
                        i += 1
                json_event_type = event_type
                json_event_sender = senders.iloc[msg_counter]
                json_event_subject = subjects.iloc[msg_counter]
                if i == 0:
                    json_event_startTime = "00:00:00"
                    json_event_endTime = "00:00:00"
                elif i == 1:
                    json_event_endTime = "00:00:00"
                event_dict["type"] = json_event_type
                event_dict["date"] = str(json_event_date)
                event_dict["stime"] = str(json_event_startTime)
                event_dict["etime"] = str(json_event_endTime)
                event_dict["title"] = json_event_sender
                event_dict["desc"] = json_event_subject
                event_list.append(event_dict)
                break
        msg_counter += 1
    for empdate in empdates:
        print(empdate.date())
    for event in event_list:
        print(event)
    return event_list
#"01-03 11:16:21", #"8月15日 22:46", #"01-03 11:16", #"7/3", #"5月11日", #"3 秒前", #"29 分钟前", #"2 小时前", #"2天前", #"今天 15:42:21", #"昨天 15:42:21", #"前天 10:41:21", #"今天 15:42", #"昨天 15:42", #"前天 10:41", #]""" # 识别不准确 #"昨天 15:42", 识别为今天。。。 可能就没这样的判断 date_generate = datefinder.find_dates(text) # for date in date_generate: print(date)
def parse(self, response):
    locale.setlocale(locale.LC_ALL, '')
    result_queue = getattr(self, 'result_queue', None)
    if response.status == 404:
        result_queue[0] = 404
        self.log('result from parse: {}'.format(result_queue[0]))
        raise CloseSpider('No such login')
    next_page = response.css('li.next a::attr(href)').get()
    post_previews = response.css('div.post-item')
    self.postsCount += len(post_previews)
    for post_preview in post_previews:
        preview = PostPreview()
        preview['id'] = post_preview.css(
            'div.post-item::attr(id)').get().split('-')[1]
        preview['title'] = unquote(
            post_preview.css(
                'div.post-item__header a.post-item__title-link::text').get(
                )).strip('\n').strip()
        preview['link'] = unquote(
            post_preview.css(
                'div.post-item__header a.post-item__title-link::attr(href)'
            ).get())
        likesCountTmp = post_preview.css(
            'span.post-item__counter span.ygls-likes-count::text').get()
        preview['likesCount'] = likesCountTmp if likesCountTmp else 0
        commentsCountTmp = post_preview.css(
            'div.post-item__footer span a.gray::text').getall()
        if len(commentsCountTmp) > 1:
            preview['commentsCount'] = commentsCountTmp[1].strip('\n').strip()
        else:
            preview['commentsCount'] = 0
        preview['views'] = post_preview.css(
            'span.post-item__counter span.post-views::text').get()
        date = post_preview.css('div.post-item__info::text').getall()[1]
        matches = datefinder.find_dates(date.strip())
        res = re.search(r'-(.+\d\s)', date)
        # 'сегодня'/'вчера' are the page's Russian words for today/yesterday
        if (res and res.group(1).find('сегодня') == -1
                and res.group(1).find('вчера') == -1):
            strr = res.group(1)
            preview['creationDate'] = self.parse_date(strr)
        else:
            preview['creationDate'] = '-'
        self.postPreviews.append(preview)
    self.log(next_page)
    if next_page is not None:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, cookies=self.cookies,
                             callback=self.parse)
    else:
        yield {
            'postsCount': self.postsCount,
            'postPreviews': self.postPreviews
        }
def trade_spider():
    info = ' '
    url = ("http://www.eventsdoha.com/white-salsa-night-a-farewell-to-"
           "dubraska-the-irish-harp-sheraton-garnd-17th-may/")
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for d in soup.find_all('p'):
        print(d.text)
        info = info + d.text + "\n"
    details = {"information": info}
    print("***********")
    results2 = db.eventinfor.insert_one(details)
    results2.inserted_id
    # for link in soup.find_all('p', ):
    #     href = link.get('href')
    #     print(href)
    return info


if __name__ == "__main__":
    information = trade_spider()
    string_with_dates = information
    matches = datefinder.find_dates(string_with_dates)
    for match in matches:
        print(match)
def get_dates(document, om_df, ind, col_dict, print_info,
              infer_date_surrounding_rows=True):
    """Extract dates from the input document.

    This method is utilized within ``preprocessor.py``. For an easy way to
    extract dates, utilize the preprocessor and set extract_dates_only = True.

    Parameters
    ----------
    document : str
        String representation of a document
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the
        columns within col_dict.
    ind : integer
        Designates the row of the dataframe which is currently being observed.
        This is required because if the current row does not have a valid date
        in the `eventstart`, then an iterative search is conducted by first
        starting at the nearest rows.
    col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the
        get_dates fn

        - **data** (*string*), should be assigned to associated column which
          stores the text logs
        - **eventstart** (*string*), should be assigned to associated column
          which stores the log submission datetime
    print_info : bool
        Flag indicating whether to print information about the preprocessing
        progress
    infer_date_surrounding_rows : bool
        If True, utilizes iterative search in dataframe to infer the datetime
        from surrounding rows if the current row's date value is nan.
        If False, does not utilize the base datetime. Consequentially, today's
        date is used to replace the missing parts of the datetime.
        Recommendation: set True if you frequently publish documents and your
        dataframe is ordered chronologically.

    Returns
    -------
    list
        List of dates found in text
    """
    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]
    try:
        row = om_df.iloc[ind]
        if print_info:
            print("Start time: ", row[EVENTSTART_COLUMN])
        no_base_date_found = False
        if isinstance(row[EVENTSTART_COLUMN], float) and np.isnan(
                row[EVENTSTART_COLUMN]):
            # Was given a NaN value as event start date, so look before and
            # after this row for a date (only when inference is enabled)
            if not infer_date_surrounding_rows:
                no_base_date_found = True
            else:
                if print_info:
                    print("found nan")
                find_valid = False
                w = 1
                om_df_len = len(om_df.index)
                while find_valid is False and no_base_date_found is False:
                    ind_behind = ind - w
                    ind_ahead = ind + w
                    if ind_behind >= 0:
                        if print_info:
                            print("checking index: ", ind_behind)
                        row_behind = om_df.iloc[ind_behind]
                        if isinstance(row_behind[EVENTSTART_COLUMN],
                                      float) and np.isnan(
                                          row_behind[EVENTSTART_COLUMN]):
                            pass
                        else:
                            basedate = list(datefinder.find_dates(
                                row_behind[EVENTSTART_COLUMN]))[0]
                            find_valid = True
                            continue
                    if ind_ahead < om_df_len:
                        if print_info:
                            print("checking index: ", ind_ahead)
                        row_ahead = om_df.iloc[ind_ahead]
                        if isinstance(row_ahead[EVENTSTART_COLUMN],
                                      float) and np.isnan(
                                          row_ahead[EVENTSTART_COLUMN]):
                            pass
                        else:
                            basedate = list(datefinder.find_dates(
                                row_ahead[EVENTSTART_COLUMN]))[0]
                            find_valid = True
                            continue  # not needed but consistent syntax
                    if ind_ahead > om_df_len and ind_behind < 0:
                        no_base_date_found = True
                    w += 1
        else:
            basedate = list(datefinder.find_dates(row[EVENTSTART_COLUMN]))[0]
        if no_base_date_found:
            matches = list(datefinder.find_dates(document))
        else:
            matches = list(datefinder.find_dates(document, base_date=basedate))
    except Exception as e:
        matches = []
        if print_info:
            print(traceback.format_exc())
            print("\n")
            print("date")
            print(row[EVENTSTART_COLUMN])
            print("proc")
            print(document)
            print("raw")
            print(om_df.iloc[[ind]][DATA_COLUMN].tolist()[0])
            print(ind)
            print(e)
            print(traceback.format_exc())

    valid_matches = []
    for mtch in matches:
        try:
            # keep only dates between 1970 and roughly 100 years from now
            if (mtch > datetime.strptime("01/01/1970", "%m/%d/%Y")) and (
                    mtch < datetime.now() + timedelta(days=365 * 100)):
                valid_matches.append(mtch)
        except Exception as e:
            if print_info:
                print(e)
    return valid_matches
def Exp(resume_path):
    # read the resume as one string (the original iterated over an open file
    # handle and then called str methods on it, which raises AttributeError)
    with open(resume_path, 'r') as fh:
        resume_text = fh.read()
    ps = PorterStemmer()
    lem = WordNetLemmatizer()
    wordindex = []
    resume_lemm = []
    temp = []
    list_lemmatize = []
    resume_text = resume_text.replace("-", " t ")
    resume_regex = [re.sub('[^a-zA-Z0-9/]+', '', _)
                    for _ in resume_text.split()]
    for i in resume_regex:
        resume_lemm.append(ps.stem(i))
    list1 = ["Education", "Skills", "STRENGTH", "Achievements", "Contact",
             "Technical", "Projects", "Address", "Academic"]
    for i in list1:
        list_lemmatize.append(ps.stem(i))
    start = resume_lemm.index("experi")
    for st in list_lemmatize:
        if st in resume_lemm:
            end_temp = resume_lemm.index(st)
            if start < end_temp:
                end = resume_lemm.index(st)
                temp.append(resume_text[start:end])
                wordindex.append(resume_lemm.index(st))
        else:
            continue
    wordindex.sort()
    index = [i for i in wordindex if i > start]
    find_date1 = resume_lemm[start:index[0]]
    # normalize relative words and stemmed month names before date parsing
    replacements = {
        "current": str(date.today()), "present": str(date.today()),
        "now": str(date.today()), "till date": str(date.today()),
        "januari": "jan", "februari": "feb", "juli": "july",
        "septemb": "sep", "octob": "oct", "novemb": "nov", "decemb": "dec",
    }
    find_date1 = [replacements.get(st, st) for st in find_date1]
    date_string = ' '.join(find_date1)
    matches = datefinder.find_dates(str(date_string))
    dates_list = list(matches)
    for i in range(len(dates_list)):
        dates_list[i] = dates_list[i].date()
    j = 0
    total_days = 0
    if len(dates_list) == 1:
        dates_list.append(date.today())
    # sum the spans between successive (start, end) date pairs
    for i in dates_list:
        total_days = total_days + (dates_list[j + 1] - dates_list[j]).days
        j = j + 2
        if j > (len(dates_list) - 1):
            break
    totalex = round((total_days / 365), 1)
    return totalex
def get_date(str_date):
    date = None
    for g in datefinder.find_dates(str(str_date)):
        date = g
    if date is None:  # avoid UnboundLocalError when no date is found
        return None
    return f"{date.year} {date.strftime('%b')} {date.day}"
def date_conversion(dataset):
    # take the first date found in each timestamp cell, truncated to the day;
    # raises IndexError when a cell contains no recognizable date
    return dataset['timestamp'].apply(lambda x: [
        pd.to_datetime(str(i).split(' ')[0])
        for i in datefinder.find_dates(str(x))
    ][0])
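# A small usage sketch on a throwaway frame (column values are illustrative):
import pandas as pd

frame = pd.DataFrame({'timestamp': ['logged 2021-07-04 12:30:00']})
print(date_conversion(frame))  # 0   2021-07-04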
}

# create a connection with the database and query the data
cursor = conn.cursor()
cursor.execute("SELECT * FROM sorting_items ORDER BY position_order")
rows = cursor.fetchall()
print('Total Row(s):', cursor.rowcount)

data = dict()
years_dict = dict()
i = 0
j = 0
for row in rows:
    matches = datefinder.find_dates(row[2])
    for match in matches:
        datee = datetime.datetime.strptime(str(match), "%Y-%m-%d %H:%M:%S")
        # bucket the item under its month; this collapses the original
        # month-by-month if/elif chain into a single lookup
        key = str(datee.month)
        if row[1] not in d[key]:
            d[key].append(row[1])
def extract_day(sentence):
    # find_dates returns a generator, so materialize it before len()/indexing
    dates = list(datefinder.find_dates(sentence))
    if len(dates) >= 1:
        return dates[0].strftime('%A')
    return datetime.datetime.now().strftime('%A')
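# Usage sketch (2021-03-05 was a Friday; the second string is assumed to
# contain nothing datefinder recognizes):
print(extract_day("the audit is scheduled for 2021-03-05"))  # Friday
print(extract_day("nothing scheduled"))  # falls back to today's weekday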
def format_date(self, date):
    matches = datefinder.find_dates(date)
    for match in matches:
        return match.strftime("%Y-%m-%d")
def scrape(request):
    # def scrape():
    p_start = 309518
    p_end = 309532
    session = requests.Session()
    session.max_redirects = 3
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    base_url = "https://www.hotnigerianjobs.com/hotjobs/"
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sample_url = os.path.join(BASE_DIR, 'techjobserver/sample')
    for page in range(0, 2):
        """ live scraping is disabled; saved sample pages are parsed instead
        present_url = base_url + str(page)
        print('\n scraping ' + present_url + '\n')
        time.sleep(20)
        page_req = session.get(present_url, verify=False)
        job_status = page_req.status_code
        print(job_status)
        content = page_req.content
        soup = BSoup(content, "html.parser")
        """
        i = 1
        soup = BSoup(open(sample_url + str(page) + '.html'))
        Jobs = soup.find('div', {"class": "middlecol"})
        job_case = Jobs.find_all('div', {'class': 'mycase'})[0]
        job_title = job_case.find('span', {'class': 'jobheader'})
        desc = job_case.find('div', {'class': 'mycase4'})
        desc = desc.find_all('div')[1]
        job_description = desc.find_all('div')
        job_intro = desc.find_all('p')[0]
        job_link = job_title.find('a')
        job_link = job_link['href']
        post_date = job_case.find('span', {'class': 'semibio'})
        matches = datefinder.find_dates(post_date.text)
        for match in matches:
            post_date = match.strftime("%A %b %d, %Y")
            print(post_date)
        try:
            raw_title = job_title.text.lower()
        except AttributeError:
            continue
        print(raw_title)
        title_list = (
            "Backend Developer", "Backend Engineer", "Business Analyst",
            "Business Intelligence Analyst", "Chief Information Officer",
            "CIO", "Cloud Architect", "Cloud Engineer", "Computer Analyst",
            "Computer Engineer", "Cyber Security Analyst", "Data Analyst",
            "Data Architect", "Data Entry", "Data Scientist", "Data Engineer",
            "Network Administrator", "Database Administrator", "DevOps",
            "DevOps Engineer", "Engineer", "Frontend Developer",
            "Frontend Engineer", "Fullstack Developer", "Fullstack Engineer",
            "Graphics Designer", "Hardware", "Information Security Analyst",
            "Information Security Consultant", "IT Director", "IT Manager",
            "IT Technician", "Mobile Developer", "Mobile App Developer",
            "Network Engineer", "Network Manager", "Network Technician",
            "Product Manager", "Programmer", "Project Manager",
            "Quality Assurance Tester", "QA Analyst",
            " Quality Assurance Engineer", "React Developer", "Sales Engineer",
            "Salesforce Administrator", "Site Reliability Engineer",
            "Software Quality Assurance Analyst", "Software Developers",
            "Software Engineer", "Software Support", "Software Tester",
            "System Administrator", "Systems Analyst", "Systems Engineer",
            "Technical Designer", "Technical Engineer", "Technical Lead",
            "Technical Product Manager", "Technical Project Manager",
            "Technical Sales", "Technical Support", "UI/UX", "UI/UX Designer")
        raw_list = (x.lower() for x in title_list)
        if any(word in raw_title for word in raw_list):
            print('\n scraping ' + job_link + '\n')
            job_description = str(job_description)
            job_description = html.escape(job_description)
            job_load = {
                "job_title": job_title.text,
                "job_description": job_description,
                "job_intro": job_intro.text,
                "job_date": post_date,
                "job_link": job_link
            }
            jobscraper_serializer = JobscraperSerializer(data=job_load)
            if jobscraper_serializer.is_valid():
                jobscraper_serializer.save()
                i += 1
                if i == 2:
                    jobscraper = ITjob.objects.all()
                    job_title = request.GET.get('job_title', None)
                    if job_title is not None:
                        jobscraper = jobscraper.filter(
                            job_title__icontains=job_title)
                    jobscraper_serializer = JobscraperSerializer(jobscraper,
                                                                 many=True)
                    return JsonResponse(jobscraper_serializer.data,
                                        safe=False,
                                        status=status.HTTP_201_CREATED)
            else:
                print(jobscraper_serializer.errors)
# Import the user-input image and extract dates from its OCR'd text.
# The imports below are assumptions: tss is taken to be pytesseract
# imported under that alias, and Image to be PIL's Image class.
import datefinder
import pytesseract as tss
from PIL import Image

imageFileName = input("enter the image name with absolute path ")

# open the image file at the path that was given
img = Image.open(imageFileName)

# convert the image text into a string
text = tss.image_to_string(img)

# print the converted text
print(text)

# using datefinder, extract dates from the text
matches = datefinder.find_dates(text)
for match in matches:
    print(match)
def parse_data(self, file=None):
    if not file:
        file = self.file
    text_pdf = self.pdf2text(file)
    log(log.INFO, "Read pdf file Union")
    if not text_pdf:
        viewer = SimplePDFViewer(file)
        for canvas in viewer:
            text_pdf += "".join(canvas.strings)
        log(log.INFO, "Get pdf text Union")
    matches = datefinder.find_dates(text_pdf)
    COUNT_FIND_DATE = 2
    date = datetime.now()
    for i, match in enumerate(matches):
        date = match
        if i >= COUNT_FIND_DATE:
            break
    last_skip_word = "% Chg"
    try:
        skip_index = text_pdf.rindex(last_skip_word) + len(last_skip_word)
    except ValueError:
        skip_index = 0
    text_pdf = text_pdf[skip_index:].strip()
    PATTERN = (
        r"(?P<name>[a-zA-Z0-9_\ \(\)\.\&\,\-]+)\s+"
        r"(?P<w_current_year>[0-9\,]+)\s+"
        r"(?P<w_previous_year>[0-9\,]+)\s+"
        r"(?P<w_chg>[0-9\.\%\-\(\)]+)\s+"
        r"(?P<q_current_year>[0-9\,]+)\s+"
        r"(?P<q_previous_year>[0-9\,]+)\s+"
        r"(?P<q_chg>[0-9\.\%\-\(\)]+)\s+"
        r"(?P<y_current_year>[0-9\,]+)\s+"
        r"(?P<y_previous_year>[0-9\,]+)\s+"
        r"(?P<y_chg>[0-9\.\%\-\(\)]+)"
    )
    # list of all products
    products = {}
    for line in re.finditer(PATTERN, text_pdf):
        products[line["name"].strip()] = dict(
            week=dict(
                current_year=get_int_val(line["w_current_year"]),
                previous_year=get_int_val(line["w_previous_year"]),
                chg=line["w_chg"],
            ),
            QUARTER_TO_DATE=dict(
                current_year=get_int_val(line["q_current_year"]),
                previous_year=get_int_val(line["q_previous_year"]),
                chg=line["q_chg"],
            ),
            YEAR_TO_DATE=dict(
                current_year=get_int_val(line["y_current_year"]),
                previous_year=get_int_val(line["y_previous_year"]),
                chg=line["y_chg"],
            ),
        )
    for prod_name in products:
        carload_id = find_carload_id(prod_name)
        company_id = f"Union_Pacific_{self.year_no}_{self.week_no}_{carload_id}"
        company = Company.query.filter(
            and_(
                Company.company_id == company_id,
                Company.product_type == prod_name
            )
        ).first()
        if not company and carload_id is not None:
            Company(
                company_id=company_id,
                carloads=products[prod_name]["week"]["current_year"],
                YOYCarloads=products[prod_name]["week"]["current_year"]
                - products[prod_name]["week"]["previous_year"],
                QTDCarloads=products[prod_name]["QUARTER_TO_DATE"]["current_year"],
                YOYQTDCarloads=products[prod_name]["QUARTER_TO_DATE"]["current_year"]
                - products[prod_name]["QUARTER_TO_DATE"]["previous_year"],
                YTDCarloads=products[prod_name]["YEAR_TO_DATE"]["current_year"],
                YOYYDCarloads=products[prod_name]["YEAR_TO_DATE"]["current_year"]
                - products[prod_name]["YEAR_TO_DATE"]["previous_year"],
                date=date,
                week=self.week_no,
                year=self.year_no,
                company_name="UNION",
                carload_id=carload_id,
                product_type=prod_name,
            ).save()
    log(log.INFO, "Write data to the database UNION")
def upload():
    headers_post = request.headers
    appid = headers_post['appId']
    tenant_id = headers_post['X-TenantID']
    object_id = headers_post['X-Object']
    Authorization = headers_post['Authorization']
    URL = os.getenv("properties_url")
    response1 = requests.get(URL, headers={
        "Content-Type": "application/json",
        "X-TenantID": tenant_id
    })
    ENV_URL = response1.json()
    ENV_URL = ENV_URL["propertyValue"]
    vf_url = str(ENV_URL) + "/cac-security/api/userinfo"
    response = requests.get(vf_url, headers={"Authorization": Authorization})
    if response.status_code == 200:
        ROOT_PATH = os.getenv("path_root_url")
        os.chdir(ROOT_PATH)
        df2 = str(tenant_id) + "/" + str(appid) + "/" + str(object_id)
        user_input = request.get_json()
        if user_input != {}:
            wanted_keys = ['sentence']
            wanted_keys1 = ['Tags']
            sentence = {
                k: user_input[k]
                for k in set(wanted_keys) & set(user_input.keys())
            }
            sentence = list(sentence.values())[0]
            if sentence is not None and sentence != '':
                sentence = sentence.lower()
                article = sentence[:]

                def find_match(sentence, df):
                    # replace each found date substring with its normalized form
                    for i in range(df.shape[0]):
                        if sentence.find(df['rpl'][i]) != -1:
                            sentence = (sentence[:sentence.find(df['rpl'][i])]
                                        + df['rpl1'][i]
                                        + sentence[sentence.find(df['rpl'][i])
                                                   + len(df['rpl'][i]):])
                    return sentence

                ls3 = list(datefinder.find_dates(sentence, source=True))
                if ls3 != []:
                    ls4 = pd.DataFrame(ls3)
                    ls4.columns = ["rpl1", "rpl"]
                    ls4["rpl1"] = ls4["rpl1"].dt.strftime('%Y-%m-%d')
                    sentence = find_match(article, ls4)
                tags = {
                    k: user_input[k]
                    for k in set(wanted_keys1) & set(user_input.keys())
                }
                tags = list(tags.values())[0]

                def lower_dict(d):
                    return dict((k, v.lower()) for k, v in d.items())

                tags = lower_dict(tags)
                new_list = []
                for key, value in tags.items():
                    new_list.append([key, value])
                ui1 = pd.DataFrame(new_list)
                ui1.columns = ['action', 'sentence']
                ui1[['sentence1', 'sentence2']] = ui1['sentence'].str.split(
                    ' ', n=1, expand=True)
                ui2 = ui1[['sentence1', 'action']]
                ui3 = ui1[['sentence2', 'action']]
                ui3.dropna(inplace=True)
                ui2.columns = ['sentence', 'action']
                ui3.columns = ['sentence', 'action']
                ui4 = ui2.append(ui3, ignore_index=True)
                lst_ip1 = nltk.word_tokenize(sentence)
                lst_ip3 = pd.DataFrame(lst_ip1)
                lst_ip3.columns = ['sentence']
                # join the tokenized sentence with the user-supplied tags
                result = pd.merge(lst_ip3, ui4, on='sentence', how='left')
                result['action'] = result['action'].fillna('o')
                result['sentence'] = result['sentence'].map(
                    str) + " " + result["action"]
                user_input3 = result['sentence']
                user_input3.to_csv(str(df2) + '/user_input3.tsv',
                                   header=False, index=False)
                user_input3 = pd.read_csv(str(df2) + '/user_input3.tsv',
                                          sep='\t', header=None)
                exists = os.path.isfile(str(df2) + '/dummy-corpus1.tsv')
                exists1 = os.path.isfile(str(df2) + '/dummy-corpus2.tsv')
                if exists and not exists1:
                    pa1 = pd.read_csv(str(df2) + '/dummy-corpus1.tsv',
                                      sep='\t', header=None)
                    pa2 = pa1.append(user_input3, ignore_index=True)
                    pa2 = pa2.append([". o"])
                elif exists1 and exists:
                    pa1 = pd.read_csv(str(df2) + '/dummy-corpus2.tsv',
                                      sep='\t', header=None)
                    pa2 = pa1.append(user_input3, ignore_index=True)
                    pa2 = pa2.append([". o"])
                else:
                    pa2 = user_input3
                    pa2 = pa2.append([". o"])
                pa2.to_csv(str(df2) + '/dummy-corpus2.tsv',
                           header=False, index=False)
                cwd = os.getcwd()
                cwd = pathlib.PureWindowsPath(cwd)
                cwd = cwd.as_posix()
                prop = "trainFile = " + str(cwd) + "/" + str(df2) + """/dummy-corpus2.tsv
serializeTo =""" + str(cwd) + "/" + str(df2) + """/corpus-tagging.ser.gz
map = word=0,answer=1
useClassFeature=true
useWord=true
useNGrams=true
noMidNGrams=true
maxNGramLeng=6
usePrev=true
useNext=true
useSequences=true
usePrevSequences=true
maxLeft=1
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
wordShape=chris2useLC
useDisjunctive=true"""
                file = open(str(cwd) + "/" + str(df2) + '/prop2.txt', 'w')
                file.write(prop)
                file.close()
                myCmd = ('java -jar stanford-ner.jar -mx4g -prop '
                         + str(df2) + '/prop2.txt')
                os.system(myCmd)
                return 'Recurrent Training Completed Successfully'
            else:
                return 'No Data to be trained on NULL'
    else:
        return 'Unsuccessful Auth'
def __svo_senti_from_article(self, article, subject=None):
    title = article[0:article.find('(title_end)')]
    try:
        date = list(datefinder.find_dates(article))[-1]
    except Exception:
        date = None
    sentences = self.__sentence_split(article)
    val1 = []
    val2 = []
    for sent in sentences:
        val1.append(self.__sentimentAnalysis(sent))
        val2.append(self.__get_svo(sent))
    result = pd.merge(pd.DataFrame(val1), pd.DataFrame(val2),
                      on='Sentence')[[
                          'Sentence', 'Names', 'Persons', 'Organizations',
                          'Facilities', 'Locations', 'Subjects', 'Predicates',
                          'Objects', 'compound', 'Event_date'
                      ]]
    result.rename(columns={'compound': 'Sentiment'}, inplace=True)
    result['Article_date'] = date
    result['Article_title'] = title

    def correctdate(eventdate, articledate):
        if eventdate is None or articledate is None:
            return None
        try:
            corrected_date = parse(eventdate,
                                   settings={'RELATIVE_BASE': articledate})
        except Exception:
            corrected_date = None
        return corrected_date

    result['Event_date'] = result['Event_date'].apply(
        lambda x: correctdate(x, date))
    # remove duplicate rows
    result = result.drop_duplicates(subset=['Sentence'], keep='first')
    '''
    ### emolex start
    def getEmolex(word):
        wordlist = re.findall(r'\w+', word)
        wordlist = [e.lower() for e in wordlist]
        df = pd.DataFrame(columns=list(self.emolexdict['type'].unique()))
        dflist = []
        for e in wordlist:
            temp = self.emolexdict[self.emolexdict['word'] == e]
            pivot = temp.pivot(index='word', columns='type',
                               values='Weight').reset_index()
            dflist.append(pivot)
        result = pd.concat(dflist)
        features = list(result)
        features.remove('word')
        df[features] = result[features]
        df['Sentence'] = word
        final = df.groupby('Sentence').apply(np.mean).reset_index()
        return final

    emolex_all = []
    for sent in result['Sentence']:
        dft = getEmolex(sent)
        emolex_all.append(dft)
    result_emolex = pd.concat(emolex_all)
    result = result.join(result_emolex.set_index('Sentence'), on='Sentence')
    ### emolex end
    '''
    if subject is None:
        return result
    return result[result['Names'].apply(lambda x: subject in x)]
for i in range(len(df)):
    if len(df[i]) * len(df[i].columns) > maxI:
        maxI = len(df[i]) * len(df[i].columns)
        bestIndex = i

dfOfInterest = df[bestIndex]
dfOfInterest = pd.DataFrame(dfOfInterest).to_numpy()
datesArray = []
for i in range(len(dfOfInterest)):
    for j in range(len(dfOfInterest[i])):
        matches = list(datefinder.find_dates(str(dfOfInterest[i][j])))
        print(matches)
        print((i, j))
        '''
        if is_date(str(dfOfInterest[i][j])):
            datesArray.append((dfOfInterest[i][j], (i, j)))
        '''
'''
# Contains tuples of the dates text and where the index of that date
# is in textArray to be used for parsing later
datesArray = []
for i in range(len(textArray)):
    if is_date(textArray[i]):
        datesArray.append((textArray[i], i))
'''
def __get_svo(self, sentence):
    '''get the SVO triples of a single sentence'''
    parsed_phrase = self.__nlp(sentence)
    names = list(parsed_phrase.ents)
    corrected_names = []
    persons = []
    locations = []
    organizations = []
    event_date = []
    norp = []
    facilities = []
    events = []
    cities = []
    # NB: spaCy exposes the string label as e.label_; the original mixed in
    # e.label (an integer id), so those comparisons could never match
    for e in names:
        if e.label_ in ('GPE', 'LOC', 'PERSON', 'ORG', 'NORP',
                        'FACILITY', 'PRODUCT'):
            corrected_names.append(e.text)
        if e.label_ == 'GPE' or e.label_ == 'LOC':
            locations.append(e.text)
        # if e.text.lower() in self.allcities:
        #     cities.append(e.text)  # detecting cities slows down processing
        if e.label_ == 'PERSON':
            persons.append(e.text)
        if e.label_ == 'ORG':
            organizations.append(e.text)
        if e.label_ == 'NORP':
            norp.append(e.text)
        if e.label_ == 'FACILITY' or e.label_ == 'PRODUCT':
            facilities.append(e.text)
        if e.label_ == 'EVENT':
            events.append(e.text)
    subjects = []
    objects = []
    verbs = []
    for text in parsed_phrase:
        if text.dep_.startswith("nsubj") or text.dep_ in ['conj']:
            subjects.append(text.orth_)
        if text.dep_ in ["dobj", 'pobj', 'iobj']:
            objects.append(text.orth_)
        if text.pos_ == "VERB" and text.lemma_ in self.__keyverbs:
            verbs.append(text.lemma_)
    # event date: try explicit weekday/relative words first, then datefinder
    try:
        event_date = list(
            set(sentence.replace('.', '').split()) &
            {'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday', 'Today', 'today', 'Tomorrow', 'tomorrow',
             'Yesterday', 'yesterday'})[0]
    except IndexError:
        try:
            event_date = list(datefinder.find_dates(sentence))[0]
            if str(event_date.year) not in sentence:
                event_date = str(event_date.month) + '/' + str(event_date.day)
            event_date = str(event_date)
        except Exception:
            event_date = None
    # map bare subject/object tokens back onto full entity names
    corrected_subjects = []
    corrected_objects = []
    corrected_names_copy = list(corrected_names)
    for sub in subjects:
        for name in corrected_names_copy:
            if sub in name:
                corrected_subjects.append(name)
                corrected_names_copy.remove(name)
                break
    for obj in objects:
        for name in corrected_names_copy:
            if obj in name:
                corrected_objects.append(name)
                corrected_names_copy.remove(name)
                break
    return {
        'Sentence': sentence,
        'Subjects': corrected_subjects,
        'Predicates': verbs,
        'Objects': corrected_objects,
        'Names': corrected_names,
        'Event_date': event_date,
        'Persons': persons,
        'Locations': locations,
        # 'Cities': cities,
        'Organizations': organizations,
        'NORP': norp,
        'Facilities': facilities,
        'Events': events
    }
def date(self):
    # strip age phrases first; datefinder throws false positives for "38-years-old"
    new_text = re.sub(r'([0-9]+)[\s-]years?[\s-]old', '', self.description)
    dates = datefinder.find_dates(new_text)
    # generator.next() was Python 2; the builtin next() works in Python 3
    return next(dates).strftime("%B %d, %Y")
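# The same pre-cleaning idea generalizes to other age-like false positives;
# a hedged sketch (this pattern is an assumption, not from the source):
import re
import datefinder

text = "Suspect, 38 years old, was last seen May 3, 2019."
cleaned = re.sub(r'\b[0-9]+\s*years?\s*old\b', '', text)
print(next(datefinder.find_dates(cleaned)))  # 2019-05-03 00:00:00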
        break

# extract email from the string
match = re.search(r'[\w\.-]+@[\w\.-]+', text)
email = match.group(0)
# print(email)

# date
recieved = 'received'
for idx, text in enumerate(string):
    if recieved in text.lower():
        # print(idx + 1, string[idx + 1])
        break

# extract the date from the line after 'received'
matches = datefinder.find_dates(string[idx + 1])
for match in matches:
    date = match.strftime('%m/%d/%Y')
    # print(match)

# retailer information
retailers = ['nordstrom']
for num in retailers:
    for idx, text in enumerate(string):
        if num in text.lower():
            retailer = num
            break
# print(retailer)