def get_date(pdfdata):

	text = parse_layout(pdfdata)
	pdflines = iter(text.splitlines())
	date = []
	for line in pdflines:
		try:
			extracted = list(datefinder.find_dates(line))
		except Exception:
			# datefinder can raise on malformed input; skip the line
			continue
		if extracted:
			date.extend(extracted)

	return date
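# A minimal usage sketch (assumes `parse_layout` returns the text layer of the
# PDF and that `pdfdata` is the raw file contents; both come from the
# surrounding project, not from datefinder):
#
#     with open("report.pdf", "rb") as fh:
#         dates = get_date(fh.read())
#     # -> a flat list of datetime objects, one per date found in any line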
def get_est_date(self, start_d, end_d=None,
                 n_first=10, n_last=10, strictness=True):
    # default end_d at call time; a datetime.today() default in the signature
    # would be evaluated only once, when the function is defined
    if end_d is None:
        end_d = datetime.today()
    # tokenize text by sentence
    sentences = sent_tokenize(self)
    # check to see if there's a date in the first n sentences
    selection = ' '.join(sentences[0:n_first])
    dates = datefinder.find_dates(selection, strict=strictness)
    # convert the datefinder results to naive datetimes at midnight
    dates = [d.replace(hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None) for d in dates]
    # verify dates are legitimate
    legit_dates = [d for d in dates if check_date(d, start_d, end_d)]
    # if any legitimate date exists in the first n sentences, return the first one
    if legit_dates:
        return legit_dates[0]
    # if not, repeat the process with the last n sentences of the document
    # (this keeps the original behavior of skipping the very last sentence)
    selection = ' '.join(sentences[-(n_last + 1):-1])
    dates = datefinder.find_dates(selection, strict=strictness)
    # convert the datefinder results to naive datetimes at midnight
    dates = [d.replace(hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=None) for d in dates]
    # verify dates are legitimate
    legit_dates = [d for d in dates if check_date(d, start_d, end_d)]
    # if any legitimate date exists in the last n sentences, return the first one
    if legit_dates:
        return legit_dates[0]
    else:
        return None
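# `check_date` is defined elsewhere in the project; a plausible minimal
# version (an assumption, not the original implementation) would be:
#
#     def check_date(d, start_d, end_d):
#         return start_d <= d <= end_d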
def test_find_date_strings(input_text, expected_date):
    if isinstance(expected_date,list):
        matches = list(datefinder.find_dates(input_text))
        assert matches == expected_date
    else:
        return_date = None
        for return_date in datefinder.find_dates(input_text):
            assert return_date == expected_date
        assert return_date is not None # handles dates that were never matched
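# This test is presumably driven by pytest parametrization; a hedged sketch of
# the harness (the real fixture data lives elsewhere in the test module):
#
#     import datefinder
#     import pytest
#     from datetime import datetime
#
#     @pytest.mark.parametrize("input_text, expected_date", [
#         ("the deadline is march 12, 2005", datetime(2005, 3, 12)),
#     ])
#     def test_find_date_strings(input_text, expected_date):
#         ...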
def getvec(lines):
    '''
    features:
        number of streets (0), states (1), cities (2), countries (3): INT
        sum of weights of the streets: FLOAT (4)
        number of phone numbers: INT (5)
        number of zip codes: INT (6)
        length of paragraph in terms: INT (7)
        has date?: 0/1 (8)

    This method calculates the feature vector for a single paragraph using the
    above features.

    Parameters
    ----------
    lines : A list of the lines of a single paragraph

    Returns
    -------
    vec : A list of length 9

    '''
    vec = [0] * 9
    for line in lines:
        phnum = len(rephone.findall(line))
        nums = len(renum.findall(line))
        numterm = 0

        for terms in st.tokenize(line):
            numterm += 1
            if terms.lower() in streets:
                vec[0] += 1
                vec[4] += streets[terms.lower()] / float(summ)

            if terms in states:
                # state names are biased towards US and Australia addresses,
                # therefore we don't add their weights
                vec[1] += 1

            if terms in cities:
                vec[2] += 1

            if terms in countries:
                vec[3] += 1

        # accumulate per-line counts; plain assignment would keep only the
        # counts of the paragraph's last line
        vec[5] += phnum
        vec[6] += nums
        vec[7] += numterm

        matches = datefinder.find_dates(line, strict=True)
        try:
            for match in matches:
                vec[8] = 1
                break
        except Exception:
            # datefinder occasionally raises while iterating its generator
            pass
    return vec
Example #5
def date_extract(input_string):
    """ extracts a date from a given input string """
    matches = list(datefinder.find_dates(input_string))
    if not matches:
        return None  # guard: matches[0] would raise IndexError
    first = matches[0]
    date = {}
    date["year"] = first.year
    date["month"] = first.month
    date["day"] = first.day
    date["hour"] = first.hour
    date["minutes"] = first.minute
    date["second"] = first.second
    return date
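# For example (hedged; exact values depend on how datefinder resolves the
# month/day order):
#
#     date_extract("Issue Date: 09/12/2010")
#     # -> {'year': 2010, 'month': 9, 'day': 12, 'hour': 0, 'minutes': 0, 'second': 0}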
        
Example #6
def check_if_date_present(text):
    ''' Check if a date/schedule is present in the text.
    :param text:
    :return: True if date present, else False
    '''
    timex_tag = tag(text)
    if re.search('<TIMEX2>',timex_tag):
        return True
    matches = datefinder.find_dates(text)
    for match in matches:
        if match:
            return True
    return False
Example #7
def check(url):
    # urllib.urlopen is Python 2 only; on Python 3 use urllib.request
    response = urllib.request.urlopen(url)
    doc = response.read().decode('utf-8', errors='ignore')
    dates_list = datefinder.find_dates(doc)
    days_ago = [(datetime.datetime.today() - date).days for date in dates_list]
    # only past dates count; min() over an empty list would raise ValueError
    past_days = [i for i in days_ago if i > 0]

    if not past_days:
        return {'status': 'No Dates', 'code': -1, 'last_entity': 0}
    else:
        last_date_validated = min(past_days)
        if last_date_validated > 100:
            return {'status': 'Not Maintained', 'code': 0, 'last_entity': last_date_validated}
        else:
            return {'status': 'Maintained', 'code': 1, 'last_entity': last_date_validated}
    def __get_events(logs_with_id):
        log_index, line = logs_with_id
        line = line.lower()

        # GET month
        matches = datefinder.find_dates(line, source=True)
        months = []
        for match in matches:
            month = sub('[^a-zA-Z]', '', match[1])
            if month:
                months.append(month)

        # only leave alphabet, maintain word split
        line = line.split()
        line_split = []
        for li in line:
            alphabet_only = sub('[^a-zA-Z]', '', li)
            line_split.append(alphabet_only)

        # GET preprocessed_event_countgroup
        # remove more than one space
        line = ' '.join(line_split)
        line = ' '.join(line.split())
        preprocessed_event_countgroup = line

        # GET preprocessed_events
        # remove word with length only 1 character
        for index, word in enumerate(line_split):
            if len(word) == 1:
                line_split[index] = ''

        # remove more than one space
        line = ' '.join(line_split)
        line = ' '.join(line.split())

        # remove stopwords
        stopwords = corpus.stopwords.words('english')
        stopwords_month = stopwords
        if months:
            stopwords_month.extend(months)

        stopwords_result = [word for word in line.split() if word not in stopwords_month]
        preprocessed_events = ' '.join(stopwords_result)
        preprocessed_events_graphedge = preprocessed_events

        preprocessed_with_id = (log_index, preprocessed_events, preprocessed_event_countgroup,
                                preprocessed_events_graphedge)
        return preprocessed_with_id
Example #9
def hasdate(address):
    '''
    Function for checking whether a date appears in an address. Dates were
    still coming through in the addresses, so I decided to manually filter
    them out.

    Parameters
    ----------
    address : a list of strings which together form one address

    Returns
    -------
    bool : True if the joined address contains a date, else False
    '''

    str1 = " ".join(address)
    matches = datefinder.find_dates(str1, strict=True)
    for match in matches:
        return True
    return False
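# For example (hedged):
#
#     hasdate(["123 Main St", "Springfield", "built 12 June 2008"])  # -> True
#     hasdate(["123 Main St", "Springfield"])                        # -> False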
Example #10
def test_find_date_strings(input_text, expected_date):
    # note: unlike the variant above, this version passes vacuously
    # when datefinder finds no dates at all
    for return_date in datefinder.find_dates(input_text):
        assert return_date == expected_date
Example #11
 def find_date(self, row):
     # note: '%-m/%-d/%Y' (unpadded) is platform-specific to Unix strftime
     matches = list(datefinder.find_dates(row, strict=True))
     if matches:
         return matches[0].strftime('%-m/%-d/%Y')
     else:
         return None
import datefinder

matches = datefinder.find_dates("obama was born on January 4th,2017 at 8:00.He got married on 12 june 2008")
for match in matches:
    print(match)
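# Expected output, roughly (exact parsing can vary by datefinder version):
#
#     2017-01-04 08:00:00
#     2008-06-12 00:00:00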
    def write_state(self, file, course):
        course_info = self.check_course(course).get()
        if not course_info:
            raise MyException('No course found with that number.')

        text_error = ""

        wb = openpyxl.load_workbook(io.BytesIO(file))
        groups = {format_group(i): i for i in wb.sheetnames}  # get_sheet_names() is removed in modern openpyxl
        self.sql = f"""
            SELECT name FROM "group-cource_rels", groups WHERE cource_id = {course} AND group_id=id
        """
        for name in {i for (i, ) in self.all()} & set(groups.keys()):
            sheet = wb[groups[name]]  # get_sheet_by_name() is removed in modern openpyxl
            first_data, last_data = {}, {}
            dates = []

            for row in sheet.values:
                if not dates:
                    dates = list(filter(bool, row[3:]))
                    continue

                key = tuple(row[:3])
                first_data[key] = tuple(row[3:])
            group_data = Student().create_group(name)
            self.sql = f"""
                SELECT
                id,
                concat(last_name, first_name, patronymic) as fio,
                gradebook_identy
                FROM students 
                WHERE group_id = {group_data[0]}
            """
            students = self.all()
            for stud in first_data.keys():
                check_id = lambda x: x[0] == stud[0]
                check_number = lambda x: x[2] == str(stud[2]).replace(' ', '')
                check_name = lambda x: x[1].lower() == str(stud[1]).replace(
                    ' ', '').lower()

                be = None
                if stud[0]:
                    be = [i for i in students if check_id(i)]
                elif stud[2]:
                    be = [i for i in students if check_number(i)]
                elif stud[1]:
                    be = [i for i in students if check_name(i)]

                if be and len(be) == 1:
                    new_key = list(stud)
                    new_key[0] = be[0][0]
                    last_data[tuple(new_key)] = first_data[stud]
                elif not be and stud[1] and stud[2]:
                    r = compile(r'^[ \n]*(?P<fio>[\w]+ [\w]+ [\w]+)[ \n]*$')
                    fio = r.search(stud[1])
                    if fio:
                        fio = fio.group('fio').split(' ')
                        student = {
                            'last_name': fio[0],
                            'first_name': fio[1],
                            'patronymic': fio[2],
                            'id': None
                        }
                        try:
                            gradebook = int(stud[2])
                        except (ValueError, TypeError):
                            text_error += f'The gradebook number must be numeric. {stud[2]} \n'
                            continue
                        student.update({
                            'group_id': name,
                            'gradebook': gradebook,
                        })
                        new_key = list(stud)
                        new_key[0] = Student().create(student,
                                                      True).commit_return()[0]
                        last_data[tuple(new_key)] = first_data[stud]
                else:
                    text_error += f"Не удалось однозначно определить студента {stud}) \n"

            if not last_data:
                continue

            self.sql = f"""
                SELECT id, date_time
                FROM lessons
                WHERE 
                    cource_id = {course} AND 
                    group_id = {group_data[0]}
            """
            all_lesson = self.all()

            last_date = {}
            for lesson in dates:
                date_lesson = list(datefinder.find_dates(lesson))

                if not date_lesson:
                    last_date[lesson] = None
                    continue

                check = [
                    i for i in all_lesson
                    if i[1] - timedelta(minutes=20) <= date_lesson[0] <= i[1] +
                    timedelta(minutes=20)
                ]

                print(check)
                if check:
                    last_date[lesson] = check[0][0]
                else:
                    last_date[lesson] = self.create_lesson(
                        course, group_data[0], lesson).commit_return()[0]
                    self.sql = f""" 
                        INSERT INTO student_visits(student_id, lesson_id, visited) 
                        VALUES {','.join(f'({i[0]}, {last_date[lesson]}, false)' for i in students)}
                        ON CONFLICT DO NOTHING
                    """
                    self.commit()

            for stud, values in last_data.items():
                for index, val in enumerate(values):

                    if not val or not last_date[dates[index]]:
                        continue

                    if isinstance(val, int):
                        self.sql = f"""
                            INSERT INTO student_performance(student_id, lesson_id, points)
                            VALUES ({stud[0]}, {last_date[dates[index]]}, {val})
                            ON CONFLICT (student_id, lesson_id) DO UPDATE SET points = EXCLUDED.points
                        """
                        self.commit()

                    self.sql = f"""
                        INSERT INTO student_visits(student_id, lesson_id, visited)
                        VALUES ({stud[0]}, {last_date[dates[index]]}, {val != '-'})
                        ON CONFLICT (student_id, lesson_id) DO UPDATE SET visited = EXCLUDED.visited
                    """
                    self.commit()

        return text_error
Example #14
    def get_svo(self, sentence):
        '''
        get SVO of single sentence
        '''
        parsed_phrase = self.nlp(sentence)
        names = list(parsed_phrase.ents)
        corrected_names = []
        persons = []
        locations = []
        organizations = []
        event_date = []
        norp = []
        facilities = []
        events = []
        for e in names:
            linked = e.text
            # skip entities that look like date words when collecting names
            if any(map(str.isupper, linked)) and not any(
                    ext in linked for ext in ['January', 'February', 'March', 'April', 'May',
                                              'June', 'July', 'August', 'September', 'October', 'November',
                                              'December', 'Sunday', 'Monday',
                                              'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'week',
                                              'Yesterday', 'month', 'day', 'Today']):
                corrected_names.append(linked)
            # use .label_ (the string label) throughout; .label is spaCy's
            # integer ID and never equals a string, so those branches never ran
            if e.label_ == 'GPE' or e.label_ == 'LOC':
                locations.append(e.text)
            if e.label_ == 'PERSON':
                persons.append(e.text)
            if e.label_ == 'ORG':
                organizations.append(e.text)
            if e.label_ == 'NORP':
                norp.append(e.text)
            if e.label_ == 'FACILITY' or e.label_ == 'PRODUCT':
                facilities.append(e.text)
            if e.label_ == 'EVENT':
                events.append(e.text)

        subjects = []
        objects = []
        verbs = []
        for text in parsed_phrase:
            if text.dep_.startswith("nsubj") or text.dep_ in ['conj']:
                subject = text.orth_
                subjects.append(subject)
            if text.dep_ in ["dobj", 'pobj']:
                object_ = text.orth_
                objects.append(object_)
            if text.pos_ == "VERB":
                verb = text.orth_
                verbs.append(verb)

        # event date
        try:
            event_date = list(set(sentence.replace('.', '').split()) & set(['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                                                            'Friday', 'Saturday', 'Sunday', 'Today',
                                                                            'today',
                                                                            'Tomorrow', 'tomorrow', 'Yesterday',
                                                                            'yesterday']))[0]

        except IndexError:
            try:
                event_date = list(datefinder.find_dates(sentence))[0]
                if str(event_date.year) not in sentence:
                    event_date = str(event_date.month) + '/' + str(event_date.day)
                event_date = str(event_date)
            except Exception:
                event_date = None

        return {'Sentence': sentence,
                'Subjects': subjects,
                'Predicates': verbs,
                'Objects': objects,
                'Names': corrected_names,
                'Event_date': event_date,
                'Persons': persons,
                'Locations': locations,
                'Organizations': organizations,
                'NORP': norp,
                'Facilities': facilities,
                'Events': events}
Example #15
def volunteer_slot(service):
    clinician_email = service.calendarList().get(
        calendarId='primary').execute()['id']
    print("Please enter your volunteer slots Date and Start-Time.")
    input_time = input(
        colored("[Day Month Time - e.g. 14 Dec 14:30]: ", 'yellow'))
    summary = input("Please enter the slot summary: ")
    description = input("Please enter the slot description: ")

    increment_time = 30

    start_date_time = list(datefinder.find_dates(input_time))[0]
    start_date_time_str = start_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")

    end_date_time = start_date_time + timedelta(minutes=increment_time)
    end_date_time_str = end_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")

    google_meet_id = uuid4().hex

    for i in tqdm(range(3)):
        event = {
            'summary': summary,
            'description': description,
            'start': {
                'dateTime': start_date_time_str,
                'timeZone': 'Africa/Johannesburg',
            },
            'end': {
                'dateTime': end_date_time_str,
                'timeZone': 'Africa/Johannesburg',
            },
            'attendees': [{
                'email': clinician_email,
                'responseStatus': 'accepted'
            }],
            'reminders': {
                'useDefault':
                False,
                'overrides': [
                    {
                        'method': 'email',
                        'minutes': 24 * 60
                    },
                    {
                        'method': 'popup',
                        'minutes': 10
                    },
                ],
            },
            'conferenceData': {
                "createRequest": {
                    "requestId": google_meet_id,
                    "conferenceSolutionKey": {
                        "type": "hangoutsMeet"
                    }
                }
            },
        }
        event = service.events().insert(calendarId=CLINIC_CALENDAR_ID,
                                        body=event,
                                        conferenceDataVersion=1).execute()
        # print('Event created: {0}'.format(event.get('htmlLink')))
        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(event)
        start_date_time = start_date_time + timedelta(minutes=increment_time)
        start_date_time_str = start_date_time.strftime(
            "%Y-%m-%dT%H:%M:%S+02:00")
        end_date_time = end_date_time + timedelta(minutes=increment_time)
        end_date_time_str = end_date_time.strftime("%Y-%m-%dT%H:%M:%S+02:00")
    print(
        colored('Your volunteer slot for ', 'green') +
        colored(summary, 'yellow') +
        colored(' was successfully created.', 'green'))
def extract_and_write_posts(elements, filename):
    import time as tt
    start = tt.time()
    print(start,len(elements))
    try:
        f = open(filename, "w", newline="\r\n")
        f.writelines(
            "TIME||TYPE||TITLE||STATUS||LINKS"
            + "\n"
            + "\n"
        )

        for x in elements:
            try:
                title = " "
                status = " "
                link = ""
                # default the time so a post with no parsable date can't leave
                # `time` unbound when the output line is assembled below
                time = datetime.now().strftime("%m/%d/%Y")

                # time
                # time = x.find_all('abbr')[0]['title']
                # url = x.find_element_by_xpath('//a[contains(@href,"href")]')
                # # title
                # title = utils.get_title_bs(x, selectors)
                # if title.text.find("shared a memory") != -1:
                #     x = x.find_all('div',attrs={'class':'_1dwg _1w_m'})
                #     title = utils.get_title_bs(x, selectors)

                # status = utils.get_status_bs(x, selectors)
                # if (
                #     title.text
                #     == driver.find_element_by_id(selectors.get("title_text")).text
                # ):
                #     if status == "":
                #         temp = utils.get_div_links_bs(x, "img", selectors)
                #         if (
                #             temp == ""
                #         ):  # no image tag which means . it is not a life event
                #             link = utils.get_div_links_bs(x, "a", selectors)[
                #                 "href"
                #             ]
                #             type = "status update without text"
                #         else:
                #             type = "life event"
                #             link = utils.get_div_links_bs(x, "a", selectors)[
                #                 "href"
                #             ]
                #             status = utils.get_div_links_bs(x, "a", selectors).text
                #     else:
                #         type = "status update"
                #         if utils.get_div_links_bs(x, "a", selectors) != "":
                #             link = utils.get_div_links_bs(x, "a", selectors)[
                #                 "href"
                #             ]

                # elif title.text.find(" shared ") != -1:

                #     x1, link = utils.get_title_links_bs(title)
                #     type = "shared " + x1

                # # elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
                # #     if title.text.find(" at ") != -1:
                # #         x1, link = utils.get_title_links(title)
                # #         type = "check in"
                # #     elif title.text.find(" in ") != 1:
                # #         status = utils.get_div_links(x, "a", selectors).text

                # # elif (
                # #     title.text.find(" added ") != -1 and title.text.find("photo") != -1
                # # ):
                # #     type = "added photo"
                # #     link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                # # elif (
                # #     title.text.find(" added ") != -1 and title.text.find("video") != -1
                # # ):
                # #     type = "added video"
                # #     link = utils.get_div_links(x, "a", selectors).get_attribute("href")

                # else:
                #     type = "others"

                # if not isinstance(title, str):
                #     title = title.text

                # status = status.replace("\n", " ")
                # title = title.replace("\n", " ")
                linkdata = x.find_all('a', href=True, role='link')
                timedata = x.find_all('a',role="link",tabindex="0")
                for i in range(len(timedata)):
                    try:
                        tryts = timedata[i]['aria-label']
                        found = list(datefinder.find_dates(tryts))  # parse once, reuse
                        if found:
                            time = found[0].strftime("%m/%d/%Y")
                            break
                    except Exception:
                        pass
                if len(elements) <= 1:
                    time = datetime.now().strftime("%m/%d/%Y")
                for sub in linkdata:
                    try:
                        link = sub['href']
                        if ids in link or 'https://www.facebook.com/' in link:
                            link = ""
                        elif link != '#':
            #                 print(link)
                            break
                    except Exception:
                        pass
                line = (
                    time
                    + " || "
                    + ' '
                    + " || "
                    + ' '
                    + " || "
                    + ' '
                    + " || "
                    + str(link)
                    + "\n"
                )

                try:
                    f.writelines(line)
                except Exception:
                    print("Posts: Could not map encoded characters")
            except Exception:
                pass
        f.close()
        print(tt.time() - start)
    except Exception:
        print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])

    return
# string_with_dates = '''
# MSDS #: RQ1007792                                                                                             Issue Date:  09/2010 Supersedes: RQ0900706                                                                                          Issue Date:  01/2009

# '''

text_file = open(
    "D:/Dropbox/_ICF_project/WA 2-75/HD/Delivery/Extract_summary/date_raw.txt",
    "r")
lines = text_file.readlines()

lines_post = []
for k in lines:
    # print "k===", k
    try:
        matches = datefinder.find_dates(k)
        # for match in matches:
        # 	print match
        matches_str = " || ".join([str(z).replace('\n', "") for z in matches])
        # print matches_str
        lines_post.append(matches_str)
    except Exception:
        lines_post.append("N/A")

# print len(lines_post)
# print lines_post

with open(
        "D:/Dropbox/_ICF_project/WA 2-75/HD/Delivery/Extract_summary/date_raw2.txt",
        'w') as f:
    for s in lines_post:
        f.write(s + "\n")
Example #18
    def parseCase(self, response):
        try:
            petitioner = response.css('h2 ::text').extract()[2].split(',')[0]
        except Exception:
            petitioner = ""
        try:
            respondent = response.css('h2 ::text').extract()[0].split(',')[0]
        except Exception:
            respondent = ""
        try:
            judgement_text = " ".join(response.css('pre ::text').extract())
        except Exception:
            judgement_text = " ".join(
                response.css('.col-sm-9 ::text').extract())
        sentences = judgement_text.split('\n')
        source = 'California Court of Appeal'
        matches = list(
            datefinder.find_dates(
                response.css(
                    'h3+ .bottom .meta-data-value ::text').extract_first()))
        date = matches[-1].day
        month = matches[-1].month
        year = matches[-1].year
        paras = judgement_text.split('\n\n')
        # scan progressively larger windows of closing paragraphs until a
        # disposition keyword appears
        judgement = 'tied / unclear'
        for window in (15, 20, 35):
            last_paras = ' '.join(paras[-window:])
            if any(w in last_paras for w in ('affirmed', 'denied', 'dismissed')):
                judgement = 'dismissed'
                break
            if 'reversed' in last_paras:
                judgement = 'allowed'
                break

        bench_sub = ', J\\.'
        bench_sentence = [x for x in sentences if re.search(bench_sub, x)]
        bench_sub = 'P\\. J\\.'
        bench_sentence += [x for x in sentences if re.search(bench_sub, x)]
        bench_sub = 'Judge:'
        bench_sentence += [x for x in sentences if re.search(bench_sub, x)]
        # strip titles and boilerplate tokens in one pass
        for token in ("P. J.", ", J.", "Trial", "Acting", "ACTING", "Judge"):
            bench_sentence = [sub.replace(token, '') for sub in bench_sentence]
        bench = [
            sub.translate(str.maketrans('', '', string.punctuation)).strip()
            for sub in bench_sentence
        ]
        bench = list(dict.fromkeys(bench))
        bench = ", ".join(bench)

        appellant_sentence = []
        for appellant_sub in ('for Defendant and Appellant',
                              'for Defendant/Appellant', 'for\nDefendant',
                              'for Petitioner', 'for Defendant'):
            appellant_sentence += [
                x for x in sentences if re.search(appellant_sub, x)
            ]
        for token in ("for", "Counsel", "Appellant", "and", "Petitioner",
                      "Defendant", "Respondent", "Appeal",
                      "under appointment by the Court of"):
            appellant_sentence = [
                sub.replace(token, '') for sub in appellant_sentence
            ]
        petitioner_counsel = [
            sub.translate(str.maketrans('', '', string.punctuation)).strip()
            for sub in appellant_sentence
        ]
        petitioner_counsel = list(dict.fromkeys(petitioner_counsel))

        respondent_sentence = []
        for respondent_sub in ('for Plaintiff and Respondent',
                               'for Respondent', 'for Plaintiff',
                               'for\nPlaintiff', 'for Plaintiff/Respondent'):
            respondent_sentence += [
                x for x in sentences if re.search(respondent_sub, x)
            ]
        for token in ("for", "Counsel", "Respondent", "and", "Plaintiff",
                      "petitioner", "Defendant", "Appellant",
                      "under appointment by the Court of"):
            respondent_sentence = [
                sub.replace(token, '') for sub in respondent_sentence
            ]
        respondent_counsel = [
            sub.translate(str.maketrans('', '', string.punctuation)).strip()
            for sub in respondent_sentence
        ]
        respondent_counsel = list(dict.fromkeys(respondent_counsel))

        items['source'] = source
        items['url'] = response.request.url
        items['petitioner'] = petitioner
        items['respondent'] = respondent
        items['date'] = date
        items['month'] = month
        items['year'] = year
        items['bench'] = bench
        items['judgement'] = judgement
        items['judgement_text'] = judgement_text
        items['petitioner_counsel'] = petitioner_counsel
        items['respondent_counsel'] = respondent_counsel
        items['title'] = respondent + ' v. ' + petitioner
        print("...")
        yield (items)
Example #19
from sortFile import SortFile

tesseract: Tesseract = Tesseract()

imageToText: ImageToText = TessaractAdapter(tesseract)

allFiles = os.listdir("images")

fileList = []

for file in allFiles:

    try:
        txt = imageToText.toText(file)

        dates = datefinder.find_dates(txt)

        dateSet = set(dates)

        for d in dateSet:

            formatedDate = d.strftime("%Y-%m-%d")
            tempDict = {'fileName': file, 'date': formatedDate}

            fileList.append(tempDict)

    except Exception as err:

        print("error processing", file, ":", err)

sortFile = SortFile()
Example #20
 def date_finder_add(self, text):
     dates = df.find_dates(text)
     list_date = []
     for date in dates:
         list_date.append(datetime.date(date.year, date.month, date.day))
     return list_date
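# For example (hedged):
#
#     self.date_finder_add("shipped 2020-05-01, arrived 2020-05-07")
#     # -> [datetime.date(2020, 5, 1), datetime.date(2020, 5, 7)]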
Example #21
org_report_list = [
    file for file in os.listdir(org_report_folder) if file.endswith('.docx')
]

report_index = []
for r_i, report_name in enumerate(org_report_list):
    f = open(os.path.join(org_report_folder, report_name), 'rb')
    document = Document(f)
    f.close()
    para_first = document.paragraphs[0]
    if len(para_first.text.split()) < 2:
        para_first.text = 'XXXX'

    for p_i, para in enumerate(document.paragraphs):
        matches = list(
            datefinder.find_dates(para.text, source=True, index=True))
        if len(matches) > 0:
            for date_t in matches:
                match_date = re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', date_t[1])
                if match_date:
                    document.paragraphs[p_i].text = document.paragraphs[
                        p_i].text.replace(date_t[1], 'XX/XX/XXXX')
                    print(date_t[1] + ' in ' + report_name)
        if 'Dictated By:' in para.text:
            para.text = 'Dictated by: XXXX'

    # print(document)
    document.save(os.path.join(output_report_folder, '%05d' % r_i + '.docx'))

    report_index.append('%05d' % r_i + '.docx;' + report_name + '\n')
                for j in range(len(response['relations'][i]['arguments'])):
                    if response['relations'][i]['arguments'][j]['entities'][0]['type'] == "Person":
                        string_to_add = response['relations'][i]['arguments'][j]['entities'][0]['text']
                        if string_to_add == "j\u2019":
                            string_to_add = tweet._json['entities']['user_mentions'][0]['screen_name']
                        person += " " + string_to_add
                    elif response['relations'][i]['arguments'][j]['entities'][0]['type'] == "Location" or \
                                    response['relations'][i]['arguments'][j]['entities'][0][
                                        'type'] == "GeopoliticalEntity":
                        location = " " + response['relations'][i]['arguments'][j]['entities'][0]['text']
        except Exception:
            continue

        try:
            date = datetime.datetime.min
            date_list = datefinder.find_dates(tweet.text.split('https')[0])
            for match in date_list:
                date = match
                break

        except ValueError:
            date = datetime.datetime.min
        if date > datetime.datetime.now() and int(date.year) < 2021:
            if date != "" or person != "" or location != "":
                hashtags = ""
                for elem in tweet._json['entities']['hashtags']:
                    hashtags += " " + elem['text']
                entry = {'tweet': tweet._json, 'date': str(date), 'location': location, 'Personnes': person,
                         'Hashtags': hashtags, 'text': tweet.text}
                conferencejson['conference'].append(entry)
                print(entry)
def extractevent():

    event_types = [
        "coffee night", "job fairs", "career fairs", "career fair",
        "tech talk", "alumni connection", "lecture", "Birthday", "Meeting",
        "Seminar", "Party", "Anniversary", "Marriage", "Appointment", "Meet",
        "sports", "career fair", "Workshop"
    ]
    event_dates = [
        "Tomorrow", "Today", "Day After Tomorrow", "Next Month", "Next Week"
    ]

    csvFile = pd.read_csv('CSV_NAME.csv')
    messages = csvFile['Message_body']
    senders = csvFile['Sender']
    subjects = csvFile['Subject']
    empevents = []
    empdates = []
    event_list = []

    msg_counter = 0
    for index, row in messages.items():  # .iteritems() was removed in pandas 2.0
        i = 0
        event_dict = {}
        json_event_date = None  # default in case no date is matched below
        str1 = ''.join(row)
        str1 = str1.replace("-", " ")
        str1 = str1.replace("|", " ")
        for event_type in event_types:
            if event_type.lower() in str1.lower():
                flag = False
                for event_date in event_dates:
                    if event_date.lower() in str1.lower():
                        convertedDate = rawday2date.getDate(event_date.lower())
                        json_event_date = convertedDate.date()
                        matches = datefinder.find_dates(str1)
                        for match in matches:
                            if match.time():
                                if i == 0:
                                    json_event_startTime = match.time()
                                elif i == 1:
                                    json_event_endTime = match.time()
                            i += 1
                        flag = True
                        break
                if not flag:
                    matches = datefinder.find_dates(str1)

                    for match in matches:
                        if i == 0:
                            json_event_date = match.date()
                            json_event_startTime = match.time()
                        elif i == 1:
                            json_event_endTime = match.time()
                        i += 1

                json_event_type = event_type
                json_event_sender = senders.iloc[msg_counter]
                json_event_subject = subjects.iloc[msg_counter]
                if i == 0:
                    json_event_startTime = "00:00:00"
                    json_event_endTime = "00:00:00"
                elif i == 1:
                    json_event_endTime = "00:00:00"
                event_dict["type"] = json_event_type
                event_dict["date"] = str(json_event_date)
                event_dict["stime"] = str(json_event_startTime)
                event_dict["etime"] = str(json_event_endTime)
                event_dict["title"] = json_event_sender
                event_dict["desc"] = json_event_subject

                event_list.append(event_dict)
                break
        msg_counter += 1

    for empdate in empdates:
        print(empdate.date())
    for event in event_list:
        print(event)
    return event_list
        #"01-03 11:16:21",
        
        #"8月15日 22:46",
        #"01-03 11:16",
    
        #"7/3",
        #"5月11日",
    
    
        
        
        #"3 秒前",
        #"29 分钟前",
        #"2 小时前",
        #"2天前",
        
        #"今天 15:42:21",
        #"昨天 15:42:21",
        #"前天 10:41:21",
        #"今天 15:42",
        #"昨天 15:42",
        #"前天 10:41",
        
        #]"""
# 识别不准确
#"昨天 15:42", 识别为今天。。。 可能就没这样的判断
date_generate = datefinder.find_dates(text)
#
for date in date_generate:
    print(date)
Example #25
    def parse(self, response):
        locale.setlocale(locale.LC_ALL, '')
        result_queue = getattr(self, 'result_queue', None)
        if response.status == 404:
            result_queue[0] = 404
            self.log('result from parse: {}'.format(result_queue[0]))
            raise CloseSpider('No such login')
        next_page = response.css('li.next a::attr(href)').get()
        post_previews = response.css('div.post-item')
        self.postsCount += len(post_previews)
        for post_preview in post_previews:
            preview = PostPreview()
            preview['id'] = post_preview.css(
                'div.post-item::attr(id)').get().split('-')[1]
            preview['title'] = unquote(
                post_preview.css(
                    'div.post-item__header a.post-item__title-link::text').get(
                    )).strip('\n').strip()
            preview['link'] = unquote(
                post_preview.css(
                    'div.post-item__header a.post-item__title-link::attr(href)'
                ).get())
            likesCountTmp = post_preview.css(
                'span.post-item__counter span.ygls-likes-count::text').get()
            preview['likesCount'] = likesCountTmp if likesCountTmp else 0
            commentsCountTmp = post_preview.css(
                'div.post-item__footer  span  a.gray::text').getall()
            #self.log( len(commentsCountTmp))
            if (len(commentsCountTmp) > 1):
                preview['commentsCount'] = commentsCountTmp[1].strip(
                    '\n').strip()
            else:
                preview['commentsCount'] = 0
            preview['views'] = post_preview.css(
                'span.post-item__counter span.post-views::text').get()
            date = post_preview.css('div.post-item__info::text').getall()[1]
            matches = datefinder.find_dates(date.strip())  # note: currently unused
            res = re.search(r'-(.+\d\s)', date)
            # 'сегодня' = 'today', 'вчера' = 'yesterday' in the scraped dates
            if (res and res.group(1).find('сегодня') == -1
                    and res.group(1).find('вчера') == -1):
                strr = res.group(1)
                #self.log(strr)
                preview['creationDate'] = self.parse_date(
                    strr
                )  #datetime.datetime.strptime(res.group(1).strip(), '%d %B %Y г., %H:%M').date()
            else:
                preview['creationDate'] = '-'

            self.postPreviews.append(preview)
            #self.log(preview['creationDate'])

        self.log(next_page)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page,
                                 cookies=self.cookies,
                                 callback=self.parse)
        else:
            yield {
                'postsCount': self.postsCount,
                'postPreviews': self.postPreviews
            }
Example #26

def trade_spider():
    info = ' '
    url = "http://www.eventsdoha.com/white-salsa-night-a-farewell-to-dubraska-the-irish-harp-sheraton-garnd-17th-may/"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for d in soup.find_all('p'):
        print(d.text)
        #info = str(div_tag.text)+str(div_tag.next_sibling)
        #print("hello")
        #info.append(d.text)
        info = info + d.text + "\n"
    details = {"information": info}
    print("***********")
    results2 = db.eventinfor.insert_one(details)
    results2.inserted_id
    # for link in soup.find_all('p', ):
    #   href = link.get('href')
    #  print(href)
    return info


if __name__ == "__main__":
    information = trade_spider()
    string_with_dates = information
    matches = datefinder.find_dates(string_with_dates)
    for match in matches:
        print(match)
Example #27
def get_dates(document,
              om_df,
              ind,
              col_dict,
              print_info,
              infer_date_surrounding_rows=True):
    """Extract dates from the input document.

    This method is utilized within ``preprocessor.py``. For an easy way to extract dates, utilize the preprocessor and set
    extract_dates_only = True.

    Parameters
    ----------
    document : str
        String representation of a document
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the columns within col_dict.
    ind : integer
        Designates the row of the dataframe which is currently being observed. This is required because if the
        current row does not have a valid date in the `eventstart`, then an iterative search is conducted
        by first starting at the nearest rows.
    col_dict: dict of {str : str}
        A dictionary that contains the column names relevant for the get_dates fn
        - **data** (*string*), should be assigned to associated column which stores the text logs
        - **eventstart** (*string*), should be assigned to associated column which stores the log submission datetime
    print_info : bool
        Flag indicating whether to print information about the preprocessing progress
    infer_date_surrounding_rows : bool
        If True, utilizes iterative search in dataframe to infer the datetime from surrounding rows if the current row's date value is nan
        If False, does not utilize the base datetime. Consequentially, today's date is used to replace the missing parts of the datetime.
        Recommendation: set True if you frequently publish documents and your dataframe is ordered chronologically

    Returns
    -------
    list
        List of dates found in text
    """

    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]

    try:
        row = om_df.iloc[ind]
        if print_info:
            print("Start time: ", row[EVENTSTART_COLUMN])

        no_base_date_found = False
        if isinstance(row[EVENTSTART_COLUMN], float) and np.isnan(
                row[EVENTSTART_COLUMN]):
            # Was given a NaN value as event start date, so look before an after this row for a date

            if infer_date_surrounding_rows:
                no_base_date_found = True

            else:
                if print_info:
                    print("found nan")
                find_valid = False

                w = 1
                om_df_len = len(om_df.index)

                while find_valid is False and no_base_date_found is False:
                    ind_behind = ind - w
                    ind_ahead = ind + w

                    if ind_behind >= 0:
                        if print_info:
                            print("checking index: ", ind_behind)
                        row_behind = om_df.iloc[ind_behind]
                        if isinstance(row_behind[EVENTSTART_COLUMN],
                                      float) and np.isnan(
                                          row_behind[EVENTSTART_COLUMN]):
                            pass
                        else:
                            basedate = list(
                                datefinder.find_dates(
                                    row_behind[EVENTSTART_COLUMN]))[0]
                            find_valid = True
                            continue

                    if ind_ahead < om_df_len:
                        if print_info:
                            print("checking index: ", ind_ahead)
                        row_ahead = om_df.iloc[ind_ahead]
                        if isinstance(row_ahead[EVENTSTART_COLUMN],
                                      float) and np.isnan(
                                          row_ahead[EVENTSTART_COLUMN]):
                            pass
                        else:
                            basedate = list(
                                datefinder.find_dates(
                                    row_ahead[EVENTSTART_COLUMN]))[0]
                            find_valid = True
                            continue  # not needed but consistent syntax

                    if ind_ahead > om_df_len and ind_behind < 0:
                        no_base_date_found = True
                    w += 1

        else:
            basedate = list(datefinder.find_dates(row[EVENTSTART_COLUMN]))[0]

        if no_base_date_found:
            matches = list(datefinder.find_dates(document))
        else:
            matches = list(datefinder.find_dates(document, base_date=basedate))

    except Exception as e:
        matches = []
        if print_info:
            print(traceback.format_exc())
            print("\n")
            print("date")
            print(row[EVENTSTART_COLUMN])
            print("proc")
            print(document)
            print("raw")
            print(om_df.iloc[[ind]][DATA_COLUMN].tolist()[0])
            print(ind)
            print(e)
            print(traceback.format_exc())

    valid_matches = []
    # valid_inds = []
    for mtch in matches:
        try:
            if (mtch > datetime.strptime("01/01/1970", "%m/%d/%Y")) and (
                    mtch < datetime.now() + timedelta(days=365 * 100)):

                valid_matches.append(mtch)

        except Exception as e:
            if print_info:
                print(e)

    return valid_matches
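# A minimal usage sketch (hypothetical column names; the real col_dict is
# supplied by the caller, e.g. the project's preprocessor):
#
#     import pandas as pd
#     om_df = pd.DataFrame({"logtext": ["Inverter failed on 3/14/2021"],
#                           "eventstart": ["03/15/2021 09:00"]})
#     col_dict = {"data": "logtext", "eventstart": "eventstart"}
#     get_dates(om_df["logtext"][0], om_df, 0, col_dict, print_info=False)
#     # -> [datetime.datetime(2021, 3, 14, 0, 0)]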
def Exp(resume_path):
	# read the file contents up front; the original iterated over the open
	# file object and called .replace() on it, which raises AttributeError
	with open(resume_path, 'r') as resume_file:
		resume_text = resume_file.read()
	ps = PorterStemmer()
	lem = WordNetLemmatizer()
	wordindex = []
	resume_lemm = []
	temp = []
	list_lemmatize = []
	resume_text = resume_text.replace("-", " t ")
	import re
	resume_regex = [re.sub('[^a-zA-Z0-9/]+', '', _) for _ in resume_text.split()]

	for i in resume_regex:
		resume_lemm.append(ps.stem(i))

	list1 = ["Education" ,"Skills", "STRENGTH" , "Achievements",  "Contact", "Technical",  "Projects", "Address",  'Academic']

	for i in list1:
		list_lemmatize.append(ps.stem(i))
	 
	for st in list_lemmatize:	
		start = resume_lemm.index("experi")
		 
		if st in resume_lemm:
			end_temp = resume_lemm.index(st)
			if start < end_temp:
				end = resume_lemm.index(st)
				temp.append(resume_text[start : end])
				wordindex.append(resume_lemm.index(st))
			else:
				continue
	wordindex.sort()
	index = [i for i in wordindex  if i > start ]
	find_date1 = resume_lemm[start:index[0]]

	# normalize relative-date words and stemmed month names in one pass
	# (the original loop re-ran every replacement once per token)
	relative = ("current", "present", "now", "till date")
	find_date1 = [str(date.today()) if st in relative else st for st in find_date1]
	month_fix = {
		"januari": "jan", "februari": "feb", "juli": "july", "septemb": "sep",
		"octob": "oct", "novemb": "nov", "decemb": "dec",
	}
	find_date1 = [month_fix.get(st, st) for st in find_date1]
	date_string = ' '.join(find_date1)
	matches = datefinder.find_dates(str(date_string))
	dates_list = list(matches)
	
	for i in range(len(dates_list)):
		dates_list[i] = dates_list[i].date()

	j = 0
	total_days = 0
	if len(dates_list) == 1:
		dates_list.append(date.today())
	
	for i in dates_list:               
		total_days = total_days + (dates_list[j+1] - dates_list[j]).days
		j = j+2
		if j > (len(dates_list)-1):
			break	
	totalex = round((total_days / 365), 1)
	return totalex
Example #29
def get_date(str_date):
    date = None  # guard: a no-match input previously raised NameError
    for g in datefinder.find_dates(str(str_date)):
        date = g
    if date is None:
        return None
    date = f"{date.year} {date.strftime('%b')} {date.day}"
    return date
def date_conversion(dataset):
    return dataset['timestamp'].apply(lambda x: [
        pd.to_datetime(str(i).split(' ')[0])
        for i in datefinder.find_dates(str(x))
    ][0])
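# A minimal usage sketch (hedged; assumes a string 'timestamp' column):
#
#     import pandas as pd
#     df = pd.DataFrame({"timestamp": ["2021-03-14 09:26:53"]})
#     date_conversion(df)  # -> Series of midnight pandas Timestamps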
        }

        # create a connection with the database and query the data
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM sorting_items ORDER BY position_order")
        rows = cursor.fetchall()
        print('Total Row(s):', cursor.rowcount)

        data = dict()
        years_dict = dict()

        i = 0
        j = 0
        for row in rows:
            #			print (row[2])
            matches = datefinder.find_dates(row[2])
            #			print (matches)

            for match in matches:
                #				print (match)
                datee = datetime.datetime.strptime(str(match),
                                                   "%Y-%m-%d %H:%M:%S")
                #				print (datee.month)
                #				data.update({str(row[1]):str(datee.month)})
                #				data[row[1]].append(datee.month)
                if datee.month == 1:
                    if not row[1] in d['1']:
                        d['1'].append(row[1])
                elif datee.month == 2:
                    if not row[1] in d['2']:
                        d['2'].append(row[1])
Example #32
def extract_day(sentence):
    # materialize the generator first; find_dates() has no len()
    date = list(datefinder.find_dates(sentence))
    if len(date) >= 1:
        return date[0].strftime('%A')
    return datetime.datetime.now().strftime('%A')
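# For example (hedged):
#
#     extract_day("the meeting is on 2021-06-18")  # -> 'Friday'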
Example #33
 def format_date(self, date):
     matches = datefinder.find_dates(date)
     for match in matches:
         return (match.strftime("%Y-%m-%d"))
Example #34
def scrape(request):
    #def scrape():
    p_start = 309518
    p_end = 309532
    session = requests.Session()
    session.max_redirects = 3
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    base_url = "https://www.hotnigerianjobs.com/hotjobs/"

    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sample_url = os.path.join(BASE_DIR, 'techjobserver/sample')

    for page in range(0, 2):
        """
        present_url = base_url + str(page)
        print('\n scraping ' + present_url +'\n')
        time.sleep(20)
        page_req = session.get(present_url, verify=False)
        job_status = page_req.status_code
        print(job_status)
        content = page_req.content
        soup = BSoup(content, "html.parser")
        """
        i = 1
        soup = BSoup(open(sample_url + str(page) + '.html'), "html.parser")
        Jobs = soup.find('div', {"class": "middlecol"})
        job_case = Jobs.find_all('div', {'class': 'mycase'})[0]
        job_title = job_case.find('span', {'class': 'jobheader'})
        desc = job_case.find('div', {'class': 'mycase4'})
        desc = desc.find_all('div')[1]
        job_description = desc.find_all('div')
        job_intro = desc.find_all('p')[0]
        job_link = job_title.find('a')
        job_link = job_link['href']
        post_date = job_case.find('span', {'class': 'semibio'})
        matches = datefinder.find_dates(post_date.text)
        for match in matches:
            post_date = match.strftime("%A %b %d, %Y")
            print(post_date)
        try:
            raw_title = job_title.text.lower()
        except AttributeError:
            continue
        print(raw_title)

        title_list = (
            "Backend Developer", "Backend Engineer", "Business Analyst",
            "Business Intelligence Analyst", "Chief Information Officer",
            "CIO", "Cloud Architect", "Cloud Engineer", "Computer Analyst",
            "Computer Engineer", "Cyber Security Analyst", "Data Analyst",
            "Data Architect", "Data Entry", "Data Scientist", "Data Engineer",
            "Network Administrator", "Database Administrator", "DevOps",
            "DevOps Engineer", "Engineer", "Frontend Developer",
            "Frontend Engineer", "Fullstack Developer", "Fullstack Engineer",
            "Graphics Designer", "Hardware", "Information Security Analyst",
            "Information Security Consultant", "IT Director", "IT Manager",
            "IT Technician", "Mobile Developer", "Mobile App Developer",
            "Network Engineer", "Network Manager", "Network Technician",
            "Product Manager", "Programmer", "Project Manager",
            "Quality Assurance Tester", "QA Analyst",
            "Quality Assurance Engineer", "React Developer", "Sales Engineer",
            "Salesforce Administrator", "Site Reliability Engineer",
            "Software Quality Assurance Analyst", "Software Developers",
            "Software Engineer", "Software Support", "Software Tester",
            "System Administrator", "Systems Analyst", "Systems Engineer",
            "Technical Designer", "Technical Engineer", "Technical Lead",
            "Technical Product Manager", "Technical Project Manager",
            "Technical Sales", "Technical Support", "UI/UX", "UI/UX Designer")
        raw_list = (x.lower() for x in title_list)

        if any(word in raw_title for word in raw_list):
            print('\n scraping ' + job_link + '\n')
            #for child in soup.recursiveChildGenerator():
            job_description = str(job_description)
            job_description = html.escape(job_description)
            job_load = {
                "job_title": job_title.text,
                "job_description": job_description,
                "job_intro": job_intro.text,
                "job_date": post_date,
                "job_link": job_link
            }
            jobscraper_serializer = JobscraperSerializer(data=job_load)
            if jobscraper_serializer.is_valid():
                jobscraper_serializer.save()
                i += 1
                if i == 2:
                    jobscraper = ITjob.objects.all()
                    job_title = request.GET.get('job_title', None)
                    if job_title is not None:
                        jobscraper = jobscraper.filter(
                            job_title__icontains=job_title)
                    jobscraper_serializer = JobscraperSerializer(jobscraper,
                                                                 many=True)
                    return JsonResponse(jobscraper_serializer.data,
                                        safe=False,
                                        status=status.HTTP_201_CREATED)
            else:
                print(jobscraper_serializer.errors)
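# Hedged note: the `for match in matches` loop above reassigns post_date, so
# if the "semibio" span contains several parseable dates only the last one
# reaches job_load; using next(matches, None) instead would pin the first.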
Example #35
# imports assumed by this notebook-style example
from PIL import Image
import pytesseract as tss
import datefinder

# ask the user for the image file name (absolute path)
imageFileName = input("enter the image name with absolute path ")

# In[31]:

# open the image at the given path
img = Image.open(imageFileName)

# In[32]:

# convert the image's text into a string via OCR
text = tss.image_to_string(img)

# In[33]:

# print the extracted text
print(text)

# In[35]:

# use datefinder to extract dates from the OCR text
matches = datefinder.find_dates(text)

# In[36]:

for match in matches:
    print(match)
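# A minimal sketch (an assumption, reusing this example's imports: PIL's
# Image, pytesseract as tss, and datefinder) bundling the cells above into a
# reusable helper:
def dates_from_image(image_path):
    img = Image.open(image_path)               # load the image
    text = tss.image_to_string(img)            # OCR the image to plain text
    return list(datefinder.find_dates(text))   # all dates datefinder can parse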

    def parse_data(self, file=None):
        if not file:
            file = self.file
        text_pdf = self.pdf2text(file)
        log(log.INFO, "Read pdf file Union")
        if not text_pdf:
            viewer = SimplePDFViewer(file)
            for canvas in viewer:
                text_pdf += "".join(canvas.strings)
        log(log.INFO, "Get pdf text Union")

        matches = datefinder.find_dates(text_pdf)

        # fall back to "now", then scan the first few matches, keeping the
        # last of them as the document date
        COUNT_FIND_DATE = 2
        date = datetime.now()
        for i, match in enumerate(matches):
            date = match
            if i >= COUNT_FIND_DATE:
                break

        last_skip_word = "% Chg"
        try:
            skip_index = text_pdf.rindex(last_skip_word) + len(last_skip_word)
        except ValueError:
            skip_index = 0

        text_pdf = text_pdf[skip_index:].strip()

        PATTERN = (
            r"(?P<name>[a-zA-Z0-9_\ \(\)\.\&\,\-]+)\s+"
            r"(?P<w_current_year>[0-9\,]+)\s+"
            r"(?P<w_previous_year>[0-9\,]+)\s+"
            r"(?P<w_chg>[0-9\.\%\-\(\)]+)\s+"
            r"(?P<q_current_year>[0-9\,]+)\s+"
            r"(?P<q_previous_year>[0-9\,]+)\s+"
            r"(?P<q_chg>[0-9\.\%\-\(\)]+)\s+"
            r"(?P<y_current_year>[0-9\,]+)\s+"
            r"(?P<y_previous_year>[0-9\,]+)\s+"
            r"(?P<y_chg>[0-9\.\%\-\(\)]+)"
        )

        # map each product name to its weekly / quarter-to-date / year-to-date figures
        products = {}
        for line in re.finditer(PATTERN, text_pdf):
            products[line["name"].strip()] = dict(
                week=dict(
                    current_year=get_int_val(line["w_current_year"]),
                    previous_year=get_int_val(line["w_previous_year"]),
                    chg=line["w_chg"],
                ),
                QUARTER_TO_DATE=dict(
                    current_year=get_int_val(line["q_current_year"]),
                    previous_year=get_int_val(line["q_previous_year"]),
                    chg=line["q_chg"],
                ),
                YEAR_TO_DATE=dict(
                    current_year=get_int_val(line["y_current_year"]),
                    previous_year=get_int_val(line["y_previous_year"]),
                    chg=line["y_chg"],
                ),
            )

        for prod_name in products:
            carload_id = find_carload_id(prod_name)
            company_id = f"Union_Pacific_{self.year_no}_{self.week_no}_{carload_id}"
            company = Company.query.filter(
                and_(
                    Company.company_id == company_id, Company.product_type == prod_name
                )
            ).first()
            if not company and carload_id is not None:
                Company(
                    company_id=company_id,
                    carloads=products[prod_name]["week"]["current_year"],
                    YOYCarloads=products[prod_name]["week"]["current_year"]
                    - products[prod_name]["week"]["previous_year"],
                    QTDCarloads=products[prod_name]["QUARTER_TO_DATE"]["current_year"],
                    YOYQTDCarloads=products[prod_name]["QUARTER_TO_DATE"][
                        "current_year"
                    ]
                    - products[prod_name]["QUARTER_TO_DATE"]["previous_year"],
                    YTDCarloads=products[prod_name]["YEAR_TO_DATE"]["current_year"],
                    YOYYDCarloads=products[prod_name]["YEAR_TO_DATE"]["current_year"]
                    - products[prod_name]["YEAR_TO_DATE"]["previous_year"],
                    date=date,
                    week=self.week_no,
                    year=self.year_no,
                    company_name="UNION",
                    carload_id=carload_id,
                    product_type=prod_name,
                ).save()
        log(log.INFO, "Write data to the database UNION")
def upload():
    headers_post = request.headers
    appid = headers_post['appId']
    tenant_id = headers_post['X-TenantID']
    object_id = headers_post['X-Object']
    Authorization = headers_post['Authorization']

    #    dct_prop = dict(line.strip().split('=') for line in open('properties.txt'))
    URL = os.getenv("properties_url")
    response1 = requests.get(URL,
                             headers={
                                 "Content-Type": "application/json",
                                 "X-TenantID": tenant_id
                             })
    ENV_URL = response1.json()
    ENV_URL = ENV_URL["propertyValue"]
    vf_url = str(ENV_URL) + "/cac-security/api/userinfo"
    response = requests.get(
        vf_url, headers={"Authorization": Authorization}
    )  #    df1=pd.read_csv("C:/Users/kartik.patnaik/Desktop/mobileapp/new_test/stanford-ner-2018-10-16/train2/Book1.csv")
    if response.status_code == 200:
        ROOT_PATH = os.getenv("path_root_url")
        os.chdir(ROOT_PATH)
        df2 = str(tenant_id) + "/" + str(appid) + "/" + str(object_id)
        user_input = request.get_json()
        if user_input != {}:
            wanted_keys = ['sentence']
            wanted_keys1 = ['Tags']
            sentence = {
                k: user_input[k]
                for k in set(wanted_keys) & set(user_input.keys())
            }
            sentence = list(sentence.values())[0]
            if sentence is not None and sentence != '':
                sentence = sentence.lower()
                article = sentence[:]

                def find_match(sentence, df):
                    for i in range(df.shape[0]):
                        if sentence.find(df['rpl'][i]) != -1:
                            sentence = sentence[:sentence.find(
                                df['rpl'][i])] + df['rpl1'][
                                    i] + sentence[sentence.find(df['rpl'][i]) +
                                                  len(df['rpl'][i]):]
                    return sentence

                ls3 = list(datefinder.find_dates(sentence, source=True))

                if ls3 != []:
                    ls4 = pd.DataFrame(ls3)
                    ls4.columns = ["rpl1", "rpl"]
                    ls4["rpl1"] = ls4["rpl1"].dt.strftime('%Y-%m-%d')
                    sentence = find_match(article, ls4)

                tags = {
                    k: user_input[k]
                    for k in set(wanted_keys1) & set(user_input.keys())
                }
                tags = list(tags.values())[0]

                def lower_dict(d):
                    new_dict = dict((k, v.lower()) for k, v in d.items())
                    return new_dict

                tags = lower_dict(tags)
                new_list = []
                for key, value in tags.items():
                    new_list.append([key, value])
                ui1 = pd.DataFrame(new_list)
                ui1.columns = ['action', 'sentence']
                #ui2 = ui1.sentence.str.split(expand=True,)
                ui1[['sentence1',
                     'sentence2']] = ui1['sentence'].str.split(' ',
                                                               n=1,
                                                               expand=True)
                ui2 = ui1[['sentence1', 'action']]
                ui3 = ui1[['sentence2', 'action']]
                #ui3.dropna(subset=['action'],inplace = True)
                ui3.dropna(inplace=True)
                ui2.columns = ['sentence', 'action']
                ui3.columns = ['sentence', 'action']
                ui4 = ui2.append(ui3, ignore_index=True)
                lst_ip1 = nltk.word_tokenize(sentence)
                lst_ip3 = pd.DataFrame(lst_ip1)
                lst_ip3.columns = ['sentence']

                #################################################join
                result = pd.merge(lst_ip3, ui4, on='sentence', how='left')

                result['action'] = result['action'].fillna('o')
                result['sentence'] = result['sentence'].map(
                    str) + " " + result["action"]
                user_input3 = result['sentence']
                user_input3.to_csv(str(df2) + '/user_input3.tsv',
                                   header=False,
                                   index=False)
                user_input3 = pd.read_csv(str(df2) + '/user_input3.tsv',
                                          sep='\t',
                                          header=None)
                exists = os.path.isfile(str(df2) + '/dummy-corpus1.tsv')
                exists1 = os.path.isfile(str(df2) + '/dummy-corpus2.tsv')
                if exists and not exists1:
                    pa1 = pd.read_csv(str(df2) + '/dummy-corpus1.tsv',
                                      sep='\t',
                                      header=None)
                    pa2 = pa1.append(user_input3, ignore_index=True)
                    pa2 = pa2.append([". o"])
                elif exists1 and exists:
                    pa1 = pd.read_csv(str(df2) + '/dummy-corpus2.tsv',
                                      sep='\t',
                                      header=None)
                    pa2 = pa1.append(user_input3, ignore_index=True)
                    pa2 = pa2.append([". o"])
                else:
                    pa2 = user_input3
                    pa2 = pa2.append([". o"])

                pa2.to_csv(str(df2) + '/dummy-corpus2.tsv',
                           header=False,
                           index=False)
                cwd = os.getcwd()
                cwd = pathlib.PureWindowsPath(cwd)
                cwd = cwd.as_posix()
                prop = "trainFile = " + str(cwd) + "/" + str(
                    df2) + """/dummy-corpus2.tsv
                serializeTo =""" + str(cwd) + "/" + str(
                        df2) + """/corpus-tagging.ser.gz
                map = word=0,answer=1
                
                useClassFeature=true
                useWord=true
                useNGrams=true
                noMidNGrams=true
                maxNGramLeng=6
                usePrev=true
                useNext=true
                useSequences=true
                usePrevSequences=true
                maxLeft=1
                useTypeSeqs=true
                useTypeSeqs2=true
                useTypeySequences=true
                wordShape=chris2useLC
                useDisjunctive=true"""

                file = open(str(cwd) + "/" + str(df2) + '/prop2.txt', 'w')
                file.write(prop)
                file.close()
                myCmd = ('java -jar stanford-ner.jar -mx4g -prop ' +
                         str(df2) + '/prop2.txt')
                os.system(myCmd)

                return 'Recurrent training completed successfully'
            else:
                return 'No data to train on: sentence was null or empty'
    else:
        return 'Unsuccessful Auth'
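# Hedged note: the source=True flag used in upload() makes datefinder yield
# (datetime, matched-substring) pairs, which is what lets find_match() splice
# the normalized '%Y-%m-%d' form back into the sentence. A minimal example:
#
#   import datefinder
#   for dt, src in datefinder.find_dates("meet me on March 5, 2020",
#                                        source=True):
#       print(dt.strftime('%Y-%m-%d'), '<-', src)
#   # 2020-03-05 <- March 5, 2020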
Example #38
    def __svo_senti_from_article(self, article, subject=None):
        title = article[0:article.find('(title_end)')]
        try:
            # take the last date mentioned anywhere in the article
            date = list(datefinder.find_dates(article))[-1]
        except Exception:
            date = None
        sentences = self.__sentence_split(article)
        val1 = []
        val2 = []

        for sent in sentences:
            val1.append(self.__sentimentAnalysis(sent))
            val2.append(self.__get_svo(sent))
        result = pd.merge(pd.DataFrame(val1),
                          pd.DataFrame(val2),
                          on='Sentence')[[
                              'Sentence', 'Names', 'Persons', 'Organizations',
                              'Facilities', 'Locations', 'Subjects',
                              'Predicates', 'Objects', 'compound', 'Event_date'
                          ]]
        result.rename(columns={'compound': 'Sentiment'}, inplace=True)
        #        try:
        #            result['date']=date
        #        except:
        #            result['date']='-----'
        result['Article_date'] = date
        result['Article_title'] = title

        def correctdate(eventdate, articledate):
            if eventdate is None:
                return None
            if articledate is None:
                return None
            try:
                corrected_date = parse(eventdate,
                                       settings={'RELATIVE_BASE': articledate})
            except Exception:
                corrected_date = None
            return corrected_date

        result['Event_date'] = result['Event_date'].apply(
            lambda x: correctdate(x, date))
        #        try:
        #            result.loc[result['date']> datetime.datetime.today() + datetime.timedelta(days=1),'date']='-----'
        #        except:
        #            pass
        result = result.drop_duplicates(subset=['Sentence'],
                                        keep='first')  # remove duplicate rows
        '''
        ###emolex start
        def getEmolex(word):
            wordlist=re.findall(r'\w+', word)
            wordlist=[e.lower() for e in wordlist]
            df=pd.DataFrame(columns=list(self.emolexdict['type'].unique()))

            dflist=[]
            for e in wordlist:

                temp=self.emolexdict[self.emolexdict['word']==e]
                pivot=temp.pivot(index='word', columns='type', values='Weight').reset_index()
                dflist.append(pivot)
            result=pd.concat(dflist)
            features=list(result)
            features.remove('word')
            df[features]=result[features]
            df['Sentence']=word

            final=df.groupby('Sentence').apply(np.mean).reset_index()
            return final

        emolex_all=[]
        for sent in result['Sentence']:
            dft=getEmolex(sent)
            emolex_all.append(dft)

        result_emolex=pd.concat(emolex_all)
        result=result.join(result_emolex.set_index('Sentence'),on='Sentence')
        ###emolex end
        '''
        if subject is None:
            return result
        else:
            return result[result['Names'].apply(lambda x: subject in x)]
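    # correctdate above relies on parse() honoring a RELATIVE_BASE setting,
    # which matches dateparser.parse (an assumption about the import): relative
    # phrases resolve against the article's own date, e.g.
    #
    #   from datetime import datetime
    #   from dateparser import parse
    #   parse("yesterday", settings={'RELATIVE_BASE': datetime(2020, 6, 10)})
    #   # -> datetime.datetime(2020, 6, 9, 0, 0)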
# pick the extracted table with the most cells (rows x columns)
maxI = 0
bestIndex = 0
for i in range(len(df)):
    if len(df[i]) * len(df[i].columns) > maxI:
        maxI = len(df[i]) * len(df[i].columns)
        bestIndex = i

dfOfInterest = df[bestIndex]

dfOfInterest = pd.DataFrame(dfOfInterest).to_numpy()

datesArray = []

for i in range(len(dfOfInterest)):
    for j in range(len(dfOfInterest[i])):

        matches = list(datefinder.find_dates(str(dfOfInterest[i][j])))
        print(matches)
        print((i, j))
        '''
        if is_date(str(dfOfInterest[i][j])):
            datesArray.append((dfOfInterest[i][j], (i,j)))
        '''
'''

# Contains tuples of the dates text and where the index of that date 
# is in textArray to be used for parsing later
datesArray = []

for i in range(len(textArray)):
    if is_date(textArray[i]):
        datesArray.append((textArray[i], i))
'''
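# is_date is referenced in the commented-out block above but never defined in
# this snippet; a common minimal version (an assumption) leans on dateutil:
from dateutil.parser import parse as _parse

def is_date(string, fuzzy=False):
    # True when dateutil can interpret the string as a date
    try:
        _parse(string, fuzzy=fuzzy)
        return True
    except (ValueError, OverflowError):
        return False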
Example #40
    def __get_svo(self, sentence):
        '''
        get SVO of single sentence
        '''
        parsed_phrase = self.__nlp(sentence)
        names = list(parsed_phrase.ents)
        corrected_names = []
        persons = []
        locations = []
        organizations = []
        event_date = []
        norp = []
        facilities = []
        events = []
        cities = []
        for e in names:
            # spaCy's string label is .label_ ; .label is the integer hash, so
            # the original comparisons of .label against strings never matched
            if e.label_ in ('GPE', 'LOC', 'PERSON', 'ORG', 'NORP',
                            'FACILITY', 'PRODUCT'):
                corrected_names.append(e.text)
            if e.label_ in ('GPE', 'LOC'):
                locations.append(e.text)
            # if e.text.lower() in self.allcities:   # detect cities, slows things down
            #                    cities.append(e.text)
            if e.label_ == 'PERSON':
                persons.append(e.text)
            if e.label_ == 'ORG':
                organizations.append(e.text)
            if e.label_ == 'NORP':
                norp.append(e.text)
            if e.label_ in ('FACILITY', 'PRODUCT'):
                facilities.append(e.text)
            if e.label_ == 'EVENT':
                events.append(e.text)

        subjects = []
        objects = []
        verbs = []
        for text in parsed_phrase:
            if text.dep_.startswith("nsubj") or text.dep_ in ['conj']:
                subject = text.orth_
                subjects.append(subject)
            if text.dep_ in ["dobj", 'pobj', 'iobj']:
                object_ = text.orth_
                objects.append(object_)
            if text.pos_ == "VERB" and text.lemma_ in self.__keyverbs:
                verb = text.lemma_
                verbs.append(verb)

        # event date
        try:
            event_date = list(
                set(sentence.replace('.', '').split()) & {
                    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                    'Saturday', 'Sunday', 'Today', 'today', 'Tomorrow',
                    'tomorrow', 'Yesterday', 'yesterday'
                })[0]

        except Exception:
            # no weekday keyword found; fall back to datefinder
            try:
                event_date = list(datefinder.find_dates(sentence))[0]
                if str(event_date.year) not in sentence:
                    # the year was inferred, not stated; keep month/day only
                    event_date = str(event_date.month) + '/' + str(
                        event_date.day)
                event_date = str(event_date)
            except Exception:
                event_date = None

        # correct subject and object
        corrected_subjects = []
        corrected_objects = []
        corrected_names_copy = list(corrected_names)
        for sub in subjects:
            for name in corrected_names_copy:
                if sub in name:
                    corrected_subjects.append(name)
                    corrected_names_copy.remove(name)
                    break
        for obj in objects:
            for name in corrected_names_copy:
                if obj in name:
                    corrected_objects.append(name)
                    corrected_names_copy.remove(name)
                    break

        return {
            'Sentence': sentence,
            'Subjects': corrected_subjects,
            'Predicates': verbs,
            'Objects': corrected_objects,
            'Names': corrected_names,
            'Event_date': event_date,
            'Persons': persons,
            'Locations': locations,
            # 'Cities': cities,
            'Organizations': organizations,
            'NORP': norp,
            'Facilities': facilities,
            'Events': events
        }
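    # Hedged sanity check for the entity loop above (label names vary by
    # spaCy model; e.g. en_core_web_sm emits FAC rather than FACILITY):
    #
    #   import spacy
    #   nlp = spacy.load("en_core_web_sm")
    #   doc = nlp("Apple opened a store in Paris on Monday.")
    #   print([(e.text, e.label_) for e in doc.ents])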
    def date(self):
        # datefinder throws false positives for phrases like "38-years-old",
        # so strip age phrases before searching
        new_text = re.sub(r'([0-9]+)[\s-]years?[\s-]old', '', self.description)
        dates = datefinder.find_dates(new_text)
        # find_dates returns a generator; next() pulls the first match
        return next(dates).strftime("%B %d, %Y")
Example #42
            break

    ## extracting email from string
    match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    email = match.group(0)
    # print(email)

    # date: find the line following the one that mentions "received"
    received = 'received'
    for idx, text in enumerate(string):
        if received in text.lower():
            # print(idx+1, string[idx+1])
            break

    ## extracting date from string
    matches = datefinder.find_dates(string[idx + 1])
    for match in matches:
        date = match.strftime('%m/%d/%Y')
        # print(match)

    #retailer information
    retailers = ['nordstrom']

    # getting retailer information
    for name in retailers:
        for idx, text in enumerate(string):
            if name in text.lower():
                retailer = name
                break

    # print(retailer)
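# A defensive variant of the e-mail extraction above (a sketch; the original
# raises AttributeError when no address is present in the text):
#
#   match = re.search(r'[\w\.-]+@[\w\.-]+', text)
#   email = match.group(0) if match else None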