def get_date(clean_words, question):
    date1 = ''
    date2 = ''
    if "yesterday" in clean_words:
        date1 = date.today() - timedelta(1)
        date1 = date1.strftime("%Y-%m-%d")
    elif "today" in clean_words:
        date1 = date.today().strftime("%Y-%m-%d")
    if date1 == '':
        # Strip ordinal suffixes ("5th" -> "5") before parsing.
        question = re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', question)
        dates = extract_dates(question)
        try:
            date1 = dates[0].strftime("%Y-%m-%d")
            date2 = dates[1].strftime("%Y-%m-%d")
        except IndexError:
            # Fewer than two dates found; keep whatever we have.
            pass
    if date1 == '':
        return "Unable to identify the required day for query"
    date_req = [date1, date2]
    return date_req
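# Usage sketch (illustrative, not from the original source): assumes
# `from date_extractor import extract_dates`, `from datetime import date,
# timedelta`, and `import re`, as the function body implies.
question = "What were the sales on 5th March 2019?"
clean_words = question.lower().split()
print(get_date(clean_words, question))  # e.g. ['2019-03-05', '']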
def date_extract(text):
    # Alternatives that were considered:
    # matches = datefinder.find_dates(text)
    # matches = dparser.parse(text, fuzzy=True)
    dates = extract_dates(text, return_precision=True)
    return dates
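# Illustrative call (an assumption, not confirmed by the original
# source): with return_precision=True, date-extractor yields
# (datetime, precision) pairs, where precision is e.g. 'year', 'month',
# or 'day' (ParseDateStr further down relies on this shape by checking
# o[1] == 'day').
for parsed, precision in date_extract("The report is due in March 2021"):
    print(parsed, precision)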
def extract_date(text):
    dates = extract_dates(text)
    for d in dates:
        if d is not None:
            # Keep only dates whose year looks plausible for this corpus.
            temp = int(str(d).split('-')[0])
            if temp in [2016, 2017, 2018, 2019, 2020]:
                print('Date : ', str(d)[:10])
                return str(d)[:10]
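# Illustrative calls (not from the original source): the first
# YYYY-MM-DD string whose year falls in 2016-2020 is returned;
# anything else yields None.
print(extract_date("Signed on 12 June 2019"))  # e.g. '2019-06-12'
print(extract_date("Signed on 12 June 1999"))  # None (year outside range)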
def extract_DOB(text):
    dob = ""
    rawdata = ""
    text_split = text.splitlines()
    count = len(text_split)
    i = 0
    # One case-insensitive pattern replaces the original list of
    # hand-written capitalisation variants ("Date of Birth", "dob",
    # "D.O.B", "dateofbirth", ...).
    pattern = re.compile(r'date\s*of\s*birth|d\.?o\.?b', re.IGNORECASE)
    for data in text_split:
        i += 1
        if pattern.search(data):
            # The date often wraps onto the following line(s), so keep
            # the matching line plus up to two lines after it.
            rawdata = data
            if i < count:
                rawdata = rawdata + " " + text_split[i]
            if count - i >= 2:
                rawdata = rawdata + " " + text_split[i + 1]
            rawdata = rawdata.replace("?", "")
            rawdata = " ".join(rawdata.split())
            break
    if rawdata == "":
        return ""
    # First try datefinder; format as DD.MM.YYYY with zero-padding.
    for match in datefinder.find_dates(rawdata):
        dob = "{:02d}.{:02d}.{}".format(match.day, match.month, match.year)
        break
    # Fall back to date-extractor if datefinder found nothing.
    if dob == "":
        for match in extract_dates(rawdata):
            dob = "{}-{}-{}".format(match.day, match.month, match.year)
            break
    return dob
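# Usage sketch (illustrative): the label and the date may sit on
# separate lines, which is why extract_DOB pulls in the lines that
# follow the match.
sample = "Patient Record\nDate of Birth:\n7 May 1990"
print(extract_DOB(sample))  # e.g. '07.05.1990'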
def get_dates(self):
    # https://spacy.io/usage/linguistic-features#101
    ents = [ent for ent in self.doc.ents if ent.label_ == 'DATE']
    dates = list()
    for ent in ents:
        date = dateparser.parse(ent.text)
        dates.append(date)
    if not dates:
        # Fall back to date-extractor when spaCy finds no DATE entities.
        extracted_dates = extract_dates(self.doc.text)
        dates += extracted_dates
    return dates
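# Minimal context sketch (hypothetical names, assumed imports): the
# get_dates above is written as a method, so it needs an object that
# carries a spaCy Doc in `self.doc`.
import spacy

nlp = spacy.load("en_core_web_sm")

class TextDates:
    def __init__(self, text):
        self.doc = nlp(text)
    get_dates = get_dates  # reuse the function defined above as a method

print(TextDates("She was born on 01/04/1937.").get_dates())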
def extract_dol(lines):
    '''Take a list of lines as input and extract the date(s), if any,
    from each line.'''
    dates_arr = []
    for line in lines:
        dates = date_extractor.extract_dates(line)
        for date in dates:
            dates_arr.append(date.date())
    return dates_arr
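# Usage sketch (illustrative): lines without a date simply contribute
# nothing to the result.
lines = ["Date of loss: 12 March 2018", "No date on this line"]
print(extract_dol(lines))  # e.g. [datetime.date(2018, 3, 12)]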
def get_dates(url):
    try:
        date_str = ''
        html_doc = requests.get(url).text
        text_doc = BeautifulSoup(html_doc, 'html.parser').text
        dates = extract_dates(text_doc)
        for d in dates:
            if isinstance(d, datetime.datetime):
                date_str += d.strftime('%Y-%m-%d') + '\n'
    except RequestException as error:
        date_str = 'Request error: {}'.format(error)
        print(date_str)
    except Exception as error:
        # Narrowed from BaseException so e.g. Ctrl-C still propagates.
        date_str = 'error: {}'.format(error)
        print(date_str)
    return date_str
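# Illustrative call (hypothetical URL): returns newline-separated
# YYYY-MM-DD strings scraped from the page text, or an error message.
print(get_dates("https://example.com/news"))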
def ParseDateStr(dstr):
    if len(dstr) == 0:
        return ''
    dstr = ' ' + dstr.lower() + ' '
    # Strip ordinal suffixes after a digit ("4th" -> "4"); the lookbehind
    # keeps the digit itself out of the match.
    dstr = re.sub(r'(?<=\d)(th|st|nd|rd)\W', ' ', dstr).strip()
    try:
        dobjs = extract_dates(dstr, return_precision=True, debug=False)
        for o in dobjs:
            if o[1] == 'day':
                dobj = o[0].replace(tzinfo=None)
                # `cdate` (the current date) is defined elsewhere.
                if dobj <= cdate:
                    return dobj
    except Exception:
        return ''
    return ''
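# Usage sketch (illustrative): ParseDateStr expects `cdate` in the
# enclosing scope; here it is assumed to be the current naive datetime.
import datetime
cdate = datetime.datetime.now()
print(ParseDateStr("March 4th, 2015"))   # e.g. 2015-03-04 00:00:00
print(ParseDateStr("sometime in 2015"))  # '' (precision is not 'day')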
def dateExtractor(filename):
    """Handle extracting dates from the text of a file."""
    try:
        text = textExtraction(filename)
        dates = extract_dates(text)
        # Drop None entries and format the rest, without mutating the
        # list while iterating over it.
        dates = [d.strftime("%Y-%m-%d") for d in dates if d is not None]
        if len(dates):
            return {'date': dates}
        return {'date': 'null'}
    except Exception:
        return {"error": "Some error occurred"}
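# Illustrative call (hypothetical file): relies on a textExtraction()
# helper, defined elsewhere, that returns the file's text.
print(dateExtractor("invoice.txt"))  # e.g. {'date': ['2019-06-12']}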
def DATETIME_to_iso(datetime_string):
    formatted_dates = []
    matches = extract_dates(datetime_string)
    for match in matches:
        if match is None:
            break
        formatted_dates.append(match.isoformat())
    if len(formatted_dates) == 0:
        # Fall back to parsedatetime for inputs such as
        # "May 1 and May 3" or "May 1 to May 3".
        cal = parsedatetime.Calendar()
        dates = datetime_string.split(" and ")
        if len(dates) == 1:
            dates = dates[0].split(" to ")
        for date_string in dates:
            time_struct, parse_status = cal.parse(date_string)
            date = datetime(*time_struct[:6])
            formatted_dates.append(date.isoformat())
    return '/'.join(formatted_dates)
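# Usage sketch (illustrative): a single date comes back as one ISO
# string; range-like inputs that date-extractor misses go through the
# parsedatetime fallback, and multiple stamps are joined with '/'.
print(DATETIME_to_iso("March 3, 2020"))
print(DATETIME_to_iso("tomorrow to next friday"))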
def date_extract(ls):
    '''
    Extract issue dates from a list of text lines via two sources:
    date-extractor and spaCy NER.
    :param ls: list of text lines
    :return: dates extracted via the two sources
    '''
    date_ner = []
    date_extractor = []
    for i in ls:
        if "issue date" in i:
            print(i)
            dates = extract_dates(i)
            if len(dates) != 0:
                print("Date Extractor:", str(dates[0].date()))
                date_extractor.append(dates[0])
            doc = nlp(i)
            for j in doc.ents:
                if j.label_ == "DATE":
                    print("Issue date: ", j)
                    date_ner.append(j)
    return date_ner, date_extractor
def analyze(request):
    puncts = string.punctuation
    word_to_find = request.POST.get("word_input")
    djText = request.POST.get('text', 'default')
    # Every flag below reads the same 'option' POST field; the second
    # argument is only the fallback used when no option was submitted.
    remPunc = request.POST.get('option', 'removepunc')
    cap = request.POST.get('option', 'capitalize')
    small = request.POST.get('option', 'toSmall')
    upper = request.POST.get('option', 'toUpper')
    word_find_flag = request.POST.get('option', 'word_find')
    New_Line = request.POST.get('option', 'New_line')
    Emails = request.POST.get('option', 'Email_Address')
    Links = request.POST.get('option', 'Links')
    Passgen = request.POST.get('option', 'Password_Generator')
    search_word = request.POST.get('option', 'Search_word')
    gallery = request.POST.get('option', 'q')
    Suggest_word = request.POST.get('option', 'suggest_word')
    Sen_Analysis = request.POST.get('option', 'Sentiment')
    Grammar = request.POST.get('option', 'grammar')
    Channel = request.POST.get('option', 'suggest_youtube')
    books = request.POST.get('option', 'suggest_books')
    articles = request.POST.get('option', 'suggest_articles')
    lemmitizer = request.POST.get('option', 'lemmitize')
    start_pdf = request.POST.get('option', 'generate_pdf')
    replace_text = request.POST.get('option', 'replace')
    Word_cloud = request.POST.get('option', 'wordcloud')
    Date = request.POST.get('option', 'date')
    Word_frequency = request.POST.get('option', 'word_frequency')
    analyzed_text = ""
    word_status = ""
    countword = len(djText.split())

    if word_find_flag == "word_find":
        if word_to_find != "":
            if djText.find(word_to_find) != -1:
                word_status = "found"
                djText = djText.replace(
                    word_to_find,
                    f"""<b style="color:{"red"};">""" + word_to_find + "</b>")
                try:
                    synonym_01 = get_synonyms(word_to_find)
                    synonyms2 = random.sample(synonym_01, 4)
                    final = ""
                    for f in synonyms2:
                        final += f + " , "
                    example = get_example(word_to_find)
                    synonyms = final + example
                except Exception:
                    synonyms = "Not Available"
            else:
                word_status = "not found"
                synonyms = "Text Not Found"
        analyzed_text = djText
        word_find = "Find Word = " + word_to_find
        synonym = format_html('<b style="color:{};">{}</b>', 'green', synonyms)
        result = {
            "analyzed_text": analyzed_text,
            "highlight": "Chosen word is highlighted in red colour and synonyms/examples in green colour",
            "purpose": word_find,
            "status": word_status,
            "synonym": synonym,
            "wordcount": countword,
            "analyze_text": True,
            "findWord": True
        }
    elif New_Line == "New_line":
        for char in djText:
            if char == '.':
                char = '\n'
            analyzed_text = analyzed_text + char
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Changes '.' to New Line",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Emails == "Email_Address":
        regex = r'^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}$'
        lst = re.findall(r'\S+@+\S+', djText)
        tmp = ""
        for x in lst:
            if re.search(regex, x):
                tmp += x
                tmp += '\n'
        result = {
            "analyzed_text": tmp,
            "purpose": "Find All Emails",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Passgen == "Password_Generator":
        stop_words = set(stopwords.words('english'))
        chars = "!£$%&*#@"
        ucase_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        text = re.sub(r'[^\w\s]', '', djText)
        token = word_tokenize(text)
        filtered_sentence = []
        for w in token:
            if w not in stop_words:
                filtered_sentence.append(w)
        if len(filtered_sentence) > 0:
            random_word = random.choice(filtered_sentence)
        else:
            random_word = token[0]
        random_word = random_word.title()
        merge = ""
        for word in random_word.split():
            merge += random.choice(chars) + word[:-1] + word[-1].upper() \
                + random.choice(string.ascii_letters) + "@" \
                + random.choice(ucase_letters) \
                + random.choice(string.digits) + " "
        final_text = merge[:-1]
        result = {
            "analyzed_text": final_text,
            "purpose": "Generate password from text",
            "generate_text": True,
            "wordcount": countword
        }
    elif search_word == "Search_word":
        url = 'https://www.dictionary.com/browse/'
        headers = requests.utils.default_headers()
        headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        })
        # Pass the headers by keyword; the second positional argument of
        # requests.get() is `params`, not `headers`.
        req = requests.get(url + djText, headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        mydivs = soup.findAll("div", {"value": "1"})[0]
        for tags in mydivs:
            meaning = tags.text
        wrap = textwrap.TextWrapper(width=100)
        word_meaning = wrap.fill(text=meaning)
        result = {
            "analyzed_text": word_meaning,
            "purpose": "Searched Word",
            "generate_text": True,
            "wordcount": countword
        }
    elif Suggest_word == "suggest_word":
        find = requests.get(
            f"https://www.dictionaryapi.com/api/v3/references/thesaurus/json/{djText}?key={api_key}"
        )
        response = find.json()
        if len(response) == 0:
            print("Word Not Recognized!")
        else:
            if str(response[0]).count(" ") == 0:
                k = []
                for j in range(len(response)):
                    k.append(response[j])
                djText = " , ".join(k)
            else:
                dictionary = PyDictionary()
                testdict = dictionary.synonym(djText)
                djText = " , ".join(testdict)
        wrap = textwrap.TextWrapper(width=100)
        suggest = wrap.fill(text=djText)
        result = {
            "analyzed_text": suggest,
            "purpose": "Suggested Word",
            "generate_text": True,
            "wordcount": countword
        }
    elif Sen_Analysis == "Sentiment":
        djText = ' '.join(
            re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                   djText).split())
        analysis = TextBlob(djText)
        # Set sentiment from polarity.
        if analysis.sentiment.polarity > 0:
            final = str(djText) + " (Positive Text)"
        elif analysis.sentiment.polarity == 0:
            final = str(djText) + " (Neutral Text)"
        else:
            final = str(djText) + " (Negative Text)"
        result = {
            "analyzed_text": final,
            "purpose": "Sentiment Analysis",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Grammar == "grammar":
        parser = GingerIt()
        result = parser.parse(djText)
        final = result["result"]
        if final == '':
            final = "Please write some text to check grammar"
        result = {
            "analyzed_text": final,
            "grammar": djText,
            "purpose": "Spelling & Grammar Check",
            "analyze_text": True,
            "wordcount": countword
        }
    elif lemmitizer == "lemmitize":
        wordnet_lemmatizer = WordNetLemmatizer()
        tokenization = nltk.word_tokenize(djText)
        count = True
        result = ""  # stays empty when nothing needed lemmatizing
        for w in tokenization:
            k = wordnet_lemmatizer.lemmatize(w, pos="v")
            if w != k:
                result = "{} -> {}".format(w, k)
                count = False
        if count:
            final = "No need for lemmatization"
        else:
            final = "(Original word) -> (Lemmatized word)"
        result = {
            "analyzed_text": result,
            "highlight": final,
            "purpose": "Lemmatization of text",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Channel == "suggest_youtube":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Suggest youtube channels",
            "status": "Press Button To View Channel links",
            "find_channel": True,
            "generate_text": True,
            "wordcount": countword
        }
    elif books == "suggest_books":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Search Books",
            "status": "Press Button To View Books",
            "find_books": True,
            "generate_text": True,
            "wordcount": countword
        }
    elif articles == "suggest_articles":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Search Articles",
            "status": "Press Button To View Articles",
            "find_articles": True,
            "generate_text": True,
            "wordcount": countword
        }
    elif start_pdf == "generate_pdf":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": "Check Your Pdf",
            "purpose": "Generate Pdf",
            "status": "Press Button To View Pdf",
            "make_pdf": True,
            "generate_text": True,
            "wordcount": countword
        }
    elif replace_text == "replace":
        final_text = re.sub(word_to_find, replace_input, djText)
        result = {
            "analyzed_text": final_text,
            "purpose": "Replacement of text in sentence",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Word_cloud == "wordcloud":
        cloud = WordCloud(background_color="white",
                          max_words=200,
                          stopwords=set(STOPWORDS))
        wc = cloud.generate(djText)
        buf = io.BytesIO()
        wc.to_image().save(buf, format="png")
        data = base64.b64encode(buf.getbuffer()).decode("utf8")
        final = "data:image/png;base64,{}".format(data)
        result = {
            "analyzed_text": " ",
            "purpose": "Wordcloud",
            "my_wordcloud": final,
            "generate_text": True,
            "wordcount": countword
        }
    elif Date == "date":
        # extract_dates returns a list; only the first hit is shown.
        final = extract_dates(djText)
        final_text = final[0].date()
        result = {
            "analyzed_text": final_text,
            "purpose": "Extract Dates from text",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Word_frequency == "word_frequency":
        input_text = djText.replace("\n", " ")
        djText = input_text.lower()
        words_dict = get_words_dict(djText)
        # Graph at most the first ten words.
        if len(words_dict) > 10:
            k = 10
        else:
            k = len(words_dict)
        y_pos = range(0, k)
        bars = []
        height = []
        count = 0
        # Print and collect the values for the graph.
        format_spaces("word", "occurrences")
        for word_str, word_amount in words_dict.items():
            format_spaces(word_str, word_amount)
            count += 1
            if count <= 10:
                bars.append(word_str)
                height.append(int(word_amount))
        # Create the bars and the names on the x-axis.
        plt.bar(y_pos, height)
        plt.xticks(y_pos, bars, size=9)
        plt.xticks(rotation='horizontal')
        plt.ylabel('Word Frequency', fontsize=12, labelpad=10)
        plt.xlabel('Words', fontsize=12, labelpad=10)
        fig = plt.gcf()
        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        data = base64.b64encode(buf.read())
        uri = urllib.parse.quote(data)
        final = "data:image/png;base64,{}".format(uri)
        result = {
            "analyzed_text": " ",
            "purpose": "Word Frequency for every word in text",
            "bar_graph": final,
            "analyze_text": True,
            "wordcount": countword
        }
    elif gallery == "q":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Images",
            "status": "Press Button To View Images",
            "find_image": True,
            "generate_text": True,
            "wordcount": countword
        }
    elif remPunc == 'removepunc':
        for char in djText:
            if char not in puncts:
                analyzed_text = analyzed_text + char
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Remove Punctuations",
            "analyze_text": True,
            "wordcount": countword
        }
    elif cap == "capitalize":
        analyzed_text = djText.capitalize()
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Capitalize",
            "analyze_text": True,
            "wordcount": countword
        }
    elif small == "toSmall":
        analyzed_text = djText.lower()
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "To Lowercase",
            "analyze_text": True,
            "wordcount": countword
        }
    elif upper == "toUpper":
        analyzed_text = djText.upper()
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "To Uppercase",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Links == "Links":
        pattern = r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])'
        links = re.findall(pattern, djText, re.IGNORECASE)
        analyzed_text = ""
        i = 0
        for x in links:
            i = i + 1
            analyzed_text += f'<a href="{x}" target="_blank">Link {i}</a>'
            analyzed_text += '\n '
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Find All Links",
            "analyze_text": True,
            "wordcount": countword
        }
    else:
        return HttpResponse(
            '''<script type="text/javascript">alert("Please select at least one option.");</script>'''
        )
    return render(request, 'analyze.html', result)
from date_extractor import extract_dates
import datetime
from datetime import date
import re

import datefinder
import numpy as np
import pytesseract
from PIL import Image

dtype = np.int64

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# OCR the image, then run date extraction over the recognised text.
text = pytesseract.image_to_string(
    Image.open(
        r"F:\Pycharm\Machine-Learning\Data_Extraction\Dataset\img1.jpeg"))
# print(text)
# text = text.replace(" ", "")

dates = extract_dates(text)
print(dates)
for i in dates:
    print('the extracted date is', i.strftime("%d %b %Y"))
    break

# st = text.split("\n")  # list containing the text of the image
def process_sample(index, pdf_path, database_name, table_name, system_username,
                   corenlp_ptr, degrees_of_rotation, fp,
                   compiled_DDMMYYYY_date_pattern, compiled_YYYYMMDD_date_pattern,
                   compiled_MMDDYYYY_date_pattern, compiled_PHN_pat):
    text = p2t.convert_pdf_to_txt(pdf_path, degrees_of_rotation)
    # per_day_num = tuple: (PERSON[], DATE[], NUMBER[])
    per_day_num = interact.annotate_ner_with_corenlp(text.replace(",", ""), corenlp_ptr)
    # filep = open("converted_text_{}.txt".format(str(index)), "w")
    # filep.write(text)

    # Each strip_dates/find_dates call appends its valid date matches to
    # the valid_dates list.
    valid_dates = []
    strip_dates(per_day_num[1], compiled_DDMMYYYY_date_pattern, valid_dates,
                DDMMYYYY=True, MMDDYYYY=False, YYYYMMDD=False)
    strip_dates(per_day_num[1], compiled_YYYYMMDD_date_pattern, valid_dates,
                DDMMYYYY=False, MMDDYYYY=False, YYYYMMDD=True)
    strip_dates(per_day_num[1], compiled_MMDDYYYY_date_pattern, valid_dates,
                DDMMYYYY=False, MMDDYYYY=True, YYYYMMDD=False)
    find_dates(text, compiled_DDMMYYYY_date_pattern, valid_dates,
               DDMMYYYY=True, MMDDYYYY=False, YYYYMMDD=False)
    find_dates(text, compiled_YYYYMMDD_date_pattern, valid_dates,
               DDMMYYYY=False, MMDDYYYY=False, YYYYMMDD=True)
    find_dates(text, compiled_MMDDYYYY_date_pattern, valid_dates,
               DDMMYYYY=False, MMDDYYYY=True, YYYYMMDD=False)

    # Keep only plausible (year, month, day) triples; the try/except
    # guards against combinations such as "31 February".
    found_datetimes = []
    for date in valid_dates:
        try:
            if 0 < int(date[1]) < 13 and 0 < int(date[2]) < 32 and 1900 < int(date[0]) < 2018:
                found_datetimes.append(
                    datetime.date(int(date[0]), int(date[1]), int(date[2])))
        except Exception:
            continue

    extracted_dates = date_extractor.extract_dates(text)
    extracted_dates = [dt.date() for dt in extracted_dates if dt]
    found_datetimes += extracted_dates

    fp.write("{}\nTest case #{} processed: ".format(str(pdf_path), index))
    fp.write("Person List: " + str(per_day_num[0]) + "\n\n")
    fp.write("CoreNLP's Date List: " + str(per_day_num[1]) + "\n\n")
    fp.write("Extracted dates with date-extractor: " + str(extracted_dates) + "\n\n")
    fp.write("Number list: " + str(per_day_num[2]) + "\n\n")
    fp.write("Verified Date List: " + str(valid_dates) + "\n\n")
    fp.write("Valid PHN List: " + str(PHN_identifier(per_day_num[2], compiled_PHN_pat)) + "\n\n")

    db = db_interaction.make_connection_to_db(database_name, system_username)

    # Combine the extracted identifiers in cross-referencing queries.
    PHN_vs_DOB_vs_partial_name_results = db_interaction.PHN_vs_DOB_vs_partial_name_query(
        db, PHN_identifier(per_day_num[2], compiled_PHN_pat), found_datetimes,
        per_day_num[0], table_name)
    PHN_vs_DOB_results = db_interaction.PHN_vs_DOB_query(
        db, PHN_identifier(per_day_num[2], compiled_PHN_pat), found_datetimes,
        table_name)
    PHN_vs_partial_name_results = db_interaction.PHN_vs_partial_name_query(
        db, PHN_identifier(per_day_num[2], compiled_PHN_pat), per_day_num[0],
        table_name)
    DOB_vs_partial_name_results = db_interaction.DOB_vs_partial_name_query(
        db, found_datetimes, per_day_num[0], table_name)

    # This patient prediction determines where the sample gets filed; the
    # trailing None represents possible bottom-up matches.
    patient_prediction_result = patient_hypothesis(
        (PHN_vs_DOB_vs_partial_name_results, PHN_vs_DOB_results,
         PHN_vs_partial_name_results, DOB_vs_partial_name_results, None))

    fp.write("\nPatient Hypothesis: " + str(patient_prediction_result)
             + " for {}".format(str(pdf_path)))
    fp.write("\nA: Matches cross-referencing the PHN vs DOB vs partial found names\n"
             + str(PHN_vs_DOB_vs_partial_name_results))
    fp.write("\nB: Matches cross-referencing the PHN vs DOB:\n"
             + str(PHN_vs_DOB_results))
    fp.write("\nC: Matches cross-referencing the PHN vs partial found names:\n"
             + str(PHN_vs_partial_name_results))
    fp.write("\nD: Matches cross-referencing the DOB vs partial found names:\n"
             + str(DOB_vs_partial_name_results))
    fp.write("\n Matches found using only DOB: "
             + str(db_interaction.DOB_query(db, found_datetimes, table_name)))
    fp.write("\n\n\n TEXT EXTRACTED: " + text)
    fp.close()
    return patient_prediction_result
import pytz
import date_extractor

text = "need to get two signatures."
dates = date_extractor.extract_dates(text)
print(dates)  # the text contains no date, so an empty list is expected
## Define regex for CUSIP (here: 5 digits, 3 letters, 1 digit)
regex = "[0-9]{5}[a-z]{3}[0-9]{1}"
flag = 0
flag_issue = 0
date_list = []
for i in range(pdfReader.numPages):
    pageObj = pdfReader.getPage(i)
    text = pageObj.extractText()
    ls = text.split("\n")
    # print(ls)
    for word in ls:
        if flag == 0:
            date = extract_dates(word)
            # if len(date) > 0:
            #     print("Doc date: ", date)
            doc = nlp(word)
            for j in doc.ents:
                if j.label_ == "DATE":
                    print("DOC DATE:", j)
                    date_list.append(j)
                    flag = 1
                    # date_ner.append(j)
                    break
    ls = listtoLower(ls)
    for index in range(len(ls)):
        if "issue date" in ls[index]:
            # print(ls[index])
            pass