Example #1
 def test_date_extraction(self):
     self.assertEqual(extract_date('02.01.2010'),
                      datetime(2010, 1, 2).date())
     self.assertEqual(extract_date('2.1.2010'),
                      datetime(2010, 1, 2).date())
     self.assertEqual(extract_date('2.1.20100'), '')
     self.assertEqual(extract_date('2.1.20'), '')
     self.assertEqual(extract_date(''), '')
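For reference, a minimal sketch of an extract_date that would satisfy these assertions; the regex and the empty-string fallback are read off the test cases and are assumptions, not the project's actual implementation:

import re
from datetime import datetime

def extract_date(text):
    # Hypothetical: match D.M.YYYY or DD.MM.YYYY; anything else yields ''.
    match = re.search(r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b', text)
    if not match:
        return ''
    day, month, year = (int(part) for part in match.groups())
    try:
        return datetime(year, month, day).date()
    except ValueError:
        # Out-of-range day or month, e.g. 31.02.2010.
        return ''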
Example #2
def parse_arrete(filename):
    # re_end, re_date, re_numero_arrete, re_references and re_articles are
    # module-level compiled patterns (see the sketch after this function).
    with open(filename, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

        prev_article = None

        data = {'refs': [], 'articles': []}

        for tag in soup.find_all('p'):
            line = tag.text.replace('\n', ' ').strip()

            if not line:
                continue

            if re_end.search(line) and not re_date.search(line):
                continue

            if re_date.search(line):
                try:
                    data['date'] = extract_date(line)
                except Exception:
                    # Malformed date: keep scanning subsequent lines.
                    pass

            if re_end.search(line) and re_date.search(line):
                data['date'] = extract_date(line)
                break

            if line.startswith('Portant'):
                data['titre'] = line

            numero = re_numero_arrete.search(line)
            if numero:
                data['numero'] = numero.group(1).strip()
                continue

            reference = re_references.search(line)
            if reference:
                data['refs'].append(line[:-1])
                continue

            current_article = re_articles.search(line)

            if current_article:
                data['articles'].append(line)
                prev_article = current_article
                continue

            if prev_article:
                data['articles'][-1] += ' ' + line

        return data
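The module-level patterns above are defined elsewhere in the project. A rough sketch of plausible definitions for French arrêté documents, offered purely as assumptions:

import re

# All patterns below are guesses; the project's real definitions may differ.
re_date = re.compile(r'\b\d{1,2}\.\d{1,2}\.\d{4}\b')            # e.g. 02.01.2010
re_end = re.compile(r'^Fait à', re.IGNORECASE)                  # closing formula
re_numero_arrete = re.compile(r'Arrêté n[o°]\s*([\w/.-]+)', re.IGNORECASE)
re_references = re.compile(r'^Vu\b', re.IGNORECASE)             # "Vu ..." recitals
re_articles = re.compile(r'^Article \d+', re.IGNORECASE)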
Example #3
 def create_task(self, args):
     """Create a document with de args provided"""
     doc = {}
     doc['id'] = self.get_free_id()
     try:
         doc['description'] = ' '.join(args['description'])
     except TypeError:
         doc['description'] = ' '.join(args['parameter'])
     if args['project']:
         doc['project'] = args['project']
     else:
         doc['project'] = 'default'
     doc['status'] = "incomplete"
     doc['date'] = datetime.now()
     doc['priority'] = prioritize(args['priority'])
     try:
         doc['tags'] = clean_tags(args['tags'])
     except TypeError:
         pass
     try:
         actual_date = extract_date(args['due_date'])
         if actual_date:
             doc['due_date'] = actual_date
     except TypeError:
         pass
     try:
         parent_id = self.get_id(int(args['parent']))
         doc['parent'] = parent_id
         doc['ancestors'] = self.get_ancestors(parent_id)
     except TypeError:
         doc['parent'] = None
     return doc
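A hypothetical args dict for this method, with field names taken from the code above; the surrounding task-manager instance and helpers like prioritize and clean_tags are not shown in the source:

args = {
    'description': ['buy', 'milk'],  # joined into "buy milk"
    'project': None,                 # falsy -> falls back to 'default'
    'priority': 'high',
    'tags': None,                    # TypeError in clean_tags -> no tags
    'due_date': None,                # TypeError in extract_date -> no due date
    'parent': None,                  # int(None) raises TypeError -> parent = None
}
task = manager.create_task(args)     # 'manager' is a hypothetical instance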
Example #4
  def query_manifests(self, depot_id):
    """Query steamdb.info for a list of manifests for a specific depot.

    Returns a list of manifests.
    """
    url_base = "https://steamdb.info/"
    response = requests.get(self.__build_url(url_base, f"depot/{depot_id}/manifests/"), headers=self.headers)
    result = []

    # Bail out with diagnostics when the request failed.
    if not self.__is_response_successful(response):
      self.__print_response_error(response)
      sys.exit()

    soup = BeautifulSoup(response.content, "html.parser")
    div = soup.find("div", {'id' : 'manifests'})
    tbody = div.find("tbody")

    # Prevent errors for depots without history.
    if tbody is not None:
      for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")

        date = utils.extract_date(tds[0].text)
        manifest_id = tds[2].text

        result.append({'date': date, 'id': manifest_id})

    return result
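The double-underscore helpers are private to the class and not shown in the source; plausible shapes, as assumptions only:

  def __build_url(self, base, path):
    # Assumed: simple concatenation of base URL and path.
    return base + path

  def __is_response_successful(self, response):
    # Assumed: HTTP 200 means success.
    return response.status_code == 200

  def __print_response_error(self, response):
    # Assumed: report the failing status code before exiting.
    print(f"Request failed: HTTP {response.status_code}")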
Example #6
 def test_extract_date(self):
     app = MagicMock()
     test_cases = [("I will travel on 05/09/2020", '05/09/2020'),
                   ("on 05/09/2021 and on 03/09/2021", False),
                   ("2021/06/30", False), ("13/05/2021", '13/05/2021'),
                   ("on 13/05/2021", '13/05/2021')]
     for option, answer in test_cases:
         self.assertEqual(extract_date(option, app), answer)
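A sketch of an extract_date consistent with these cases: it returns the single DD/MM/YYYY date found in the text, and False when there are zero or several matches. The unused app argument mirrors the MagicMock in the test; this is an inference from the assertions, not the real implementation:

import re

def extract_date(text, app):
    # Hypothetical: only DD/MM/YYYY counts, so "2021/06/30" yields False.
    matches = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
    if len(matches) == 1:
        return matches[0]
    return False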
Example #7
 def __init__(self, **kwargs):
     user_data = {
         'full_name': '{} {}'.format(
             kwargs.get('last_name', ''), kwargs.get('first_name', '')
         ),
         'first_name': kwargs.get('first_name', ''),
         'last_name': kwargs.get('last_name', ''),
         'bdate': extract_date(kwargs.get('bdate', '')),
         'phone': extract_phone(kwargs.get('home_phone', '')),
         'nickname': kwargs.get('nickname', ''),
         'site': kwargs.get('site', ''),
     }
     super().__init__(**user_data)
Example #8
def job_parser(file):
    imageString = pytesseract.image_to_string(Image.open(file), lang="eng")
    raw_text = imageString.replace('\n', ' ')
    input_file_name = file.split("/")[-1]
    # The posting date is encoded in the underscore-separated file name.
    date_of_posting = utils.extract_date(''.join(file.split('_')[1:3]))

    details_dict = {
        "file_name": input_file_name,
        "date_of_posting": date_of_posting
    }

    # duration of the job
    job_duration = utils.extract_job_duration(raw_text)
    details_dict["job_duration"] = job_duration

    # Level of expertise
    exp_level = utils.experience_level(raw_text)
    details_dict["experience_level"] = exp_level

    # Job description
    job_desc = utils.extract_job_description(imageString)
    details_dict["job_description"] = job_desc

    # Job Related to (DL/ML/DA)
    job_main_skill = utils.job_main_skill(imageString)
    details_dict["job_main_skill"] = job_main_skill

    # Job detailed description
    full_desc = utils.extract_full_job_description(raw_text)
    details_dict["job_full_description"] = full_desc

    # Skills required (not extracted; stored as a NaN placeholder)
    skill_req = np.nan
    details_dict["required_skill"] = str(skill_req)

    # How many Connects required
    temp_conn = raw_text.split('Worldwide')[1]
    if 'Send a proposal for:' in temp_conn:
        req_con = temp_conn.split('Send a proposal for:')[1].split()[0]
        details_dict["required_connects"] = str(req_con)
    else:
        req_con = raw_text.split('Worldwide')[1].split()[0]  # working
        details_dict["required_connects"] = str(req_con)

    return details_dict
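A hypothetical call; the underscore-separated date parts in the file name are pure guesswork from the file.split('_')[1:3] slice, since the real naming convention is not shown:

# Invented file name for illustration only.
details = job_parser('screens/job_2021-05-13_001.png')
print(details['date_of_posting'], details['required_connects'])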
Example #9
 def _createOpinions(self, hotel_id):
     for opinion in self.opinions:
         opinionObj = ed.model.Opinion()
         opinionObj.user = opinion['name']
         opinionObj.country = (self._getOrCreateCountry(opinion['country']).id
                               if opinion['country'] is not None else None)
         opinionObj.age_range = self._getOrCreateAgeRange(opinion['age_range'])
         opinionObj.date = extract_date(opinion['date'])
         opinionObj.hotel_id = hotel_id
         opinionObj.positive = opinion['positive']
         opinionObj.negative = opinion['negative']
         opinionObj.grade = opinion['grade'].replace(",",".")
         opinionObj.title = opinion['title']
         opinionObj.user_opinions = opinion['visits']
         self.session.add(opinionObj)
         self.session.flush()
         tags = [self._getOrCreateTags(tag) for tag in opinion['tags']]
         optags = [ed.model.OpinionTag(tag=tag.id, opinion=opinionObj.id)
                   for tag in tags]
         self.session.add_all(optags)
         self.session.flush()
Example #10
  def query_manifests(self, depot_id):
    """Query steamdb.info for a list of manifests for a specific depot and return that list.

    Returns a list of manifests
    """
    url = f"https://steamdb.info/depot/{depot_id}/manifests/"
    response = self._query_website(url, headers=self.headers)
    result = []

    soup = BeautifulSoup(response.content, "html.parser")
    div = soup.find("div", {'id' : 'manifests'})
    tbody = div.find("tbody")

    # Prevent errors for depots without history.
    if tbody is not None:
      for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")

        date = utils.extract_date(tds[0].text)
        manifest_id = tds[2].text

        result.append({'date': date, 'id': manifest_id})

    return result
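Hypothetical usage, assuming a scraper class wrapping these methods (the class name and depot id are invented for illustration):

scraper = SteamDBScraper()                       # hypothetical wrapper class
for manifest in scraper.query_manifests(441):    # 441: an example depot id
    print(manifest['date'], manifest['id'])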
Example #11
def extract_reviews_from_url(url):
    try:
        # Initialized here so the except block at the end can log it safely.
        review_div = None

        logger.debug('Getting reviews from %s', url)
        # Clean end of URL to get the main Beer URL
        beer_url = url
        if '?' in url:
            beer_url = re.sub(r'\?.*', '', url)

        # Keep the starting review order (from the 'start' query parameter).
        review_order = 0
        if 'start=' in url:
            parse_result = urlparse(url)
            query_dict = parse_qs(parse_result.query)
            try:
                review_order = int(query_dict['start'][0])
            except Exception:
                logger.error('Could not get start index. Query string: %s', query_dict)
                raise

        empty_rating_dict = {'beer_url': beer_url, 'user_url': '',
                             'score': '', 'rdev': '', 'date': '',
                             'review': '', 'scrap_time': '', 'review_order': ''}

        list_ratings_reviews = []
        soup = make_soup(url)

        review_divs = soup.findAll(id='rating_fullview_content_2')

        for review_div in review_divs:

            rating_dict = empty_rating_dict.copy()
            rating_dict['review_order'] = str(review_order)
            review_order += 1

            now = datetime.datetime.now()
            rating_dict['scrap_time'] = str(now)

            # Date
            muted = review_div.find_all(class_='muted')
            date = muted[-1].find_all('a')[-1].contents[0]
            real_date = utils.extract_date(date, current_date=now)
            rating_dict['date'] = str(real_date)

            # user url
            rating_dict['user_url'] = review_div.find(class_='username')['href']

            # score
            bascore = review_div.find(class_='BAscore_norm')
            rating_dict['score'] = bascore.contents[0]

            # Now we'll process line by line... Always ugly
            # rdev - useful?
            norm_line = review_div.find(class_='rAvg_norm')

            rdev_line = norm_line.next_sibling
            rdev_string = rdev_line.string
            # Need to take into account rDev 0%
            if '%' not in rdev_string:
                rdev_line = rdev_line.next_sibling
                rdev_string = rdev_line.string
            rdev = rdev_line.string.replace('%', '').replace('rDev', '').strip()
            rating_dict['rdev'] = rdev

            # If there is a review, then we have more info
            next_el = rdev_line.next_sibling
            next_el_sibl = next_el.next_sibling
            current_el = next_el_sibl

            # Get all siblings, in any case
            all_siblings = current_el.next_siblings
            # Remove all tags from the siblings
            true_siblings = [x for x in all_siblings
                             if isinstance(x, NavigableString) or not x.name]

            # It's a review, let's parse it
            review_string = " ".join(true_siblings[0: -2])
            rating_dict['review'] = review_string

            list_ratings_reviews.append(rating_dict)

        return list_ratings_reviews

    except Exception:
        logger.error('Error fetching reviews and ratings from %s', url)
        if review_div:
            logger.error('Div: ')
            logger.error(review_div)
        raise
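Hypothetical usage; the URL shape is invented, though the BAscore_norm and rDev markers above suggest a BeerAdvocate-style review page:

# Invented URL for illustration; 'start' controls the review offset.
reviews = extract_reviews_from_url(
    'https://www.beeradvocate.com/beer/profile/123/456/?start=25')
for r in reviews:
    print(r['date'], r['score'], r['review'][:60])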