def get_basic_info(root=None):
    """
    Read sex and birthdate from the 'basic-info' section below ``root``.

    Every failure is reported on stdout and leaves the corresponding value at
    its default, so the function always returns a tuple.

    :param root: Object exposing Selenium's ``find_element(by, value)`` that
                 contains the element with id ``basic-info``. ``None`` is
                 treated the same as a missing section.
    :return: ``(sex, birthdate)``. ``sex`` is ``u'H'`` for 'Hombre', ``u'M'``
             for 'Mujer' and ``u'?'`` otherwise; ``birthdate`` is the result
             of ``parse_date`` or ``None`` when it could not be obtained.
    :rtype: tuple
    """
    # NOTE: a parenthesised single-argument print behaves the same whether it
    # is parsed as a print statement or as a function call.
    sex = u'?'
    birthdate = None

    if root is None:
        # The default argument used to crash with AttributeError here.
        print("[ERROR]: Could not find 'basic-info' section")
        return sex, birthdate

    try:
        basic_info = root.find_element(By.ID, "basic-info")
    except NoSuchElementException:
        print("[ERROR]: Could not find 'basic-info' section")
        return sex, birthdate

    # The XPaths start with './/' so they are resolved inside ``basic_info``;
    # a leading '//' searches the whole document even when called on an
    # element.
    try:
        sex_elem = basic_info.find_element(
            By.XPATH,
            ".//div[@title='Sexo']/table/tbody/tr/td[2]/div"
        )
        sex = {'Hombre': u'H', 'Mujer': u'M'}.get(sex_elem.text, u'?')
    except NoSuchElementException:
        print("[ERROR]: No information about gender")

    try:
        birthdate_elem = basic_info.find_element(
            By.XPATH,
            ".//div[@title='Fecha de nacimiento']/table/tbody/tr/td[2]/div"
        )
        birthdate = parse_date(birthdate_elem.text)
    except NoSuchElementException:
        print("[ERROR]: No information about birthdate")
    except UnrecognizedDateFormatError as e:
        print("[ERROR]: Unrecognized date format for '{0}'".format(e.date))

    return sex, birthdate
def group_by_period(queryset, column, period, **annotate):
    """
    Bucket a queryset by a truncated date column and annotate each bucket.

    :param queryset: Original queryset
    :type queryset: django.db.QuerySet
    :param column: Column for grouping
    :type column: str
    :param period: Period for grouping ('year', 'month', 'day')
    :type period: str
    :param annotate: Dict for `.annotate()`
    :type annotate: dict[str,str]
    :return: OrderedDict of period -> annotate columns, filled in
             ascending ``period_group`` order
    :rtype: collections.OrderedDict
    """
    # Based on http://stackoverflow.com/a/8746532/51685
    truncated_sql = connection.ops.date_trunc_sql(period, column)
    output_columns = ["period_group"] + list(annotate.keys())

    rows = (
        queryset
        .extra({"period_group": truncated_sql})
        .values("period_group")
        .annotate(**annotate)
        .order_by("period_group")
        .values(*output_columns)
    )

    grouped = OrderedDict()
    for row in rows:
        # Popping the key leaves only the annotate columns in the row.
        period_key = parse_date(row.pop("period_group"))
        grouped[period_key] = row
    return grouped
def test_dates(self):
    """Check that each sample string parses to the date 2009-01-01."""
    samples = (
        '01/01/09',
        '1/1/09',
        '1/1/2009',
        'Jan 1, 09',
        'Jan 1, 2009',
        'January 1, 2009',
        '2009-01-01',
        '1.1.09',
    )
    for sample in samples:
        parsed = parse_date(sample)
        # The sample is passed as the message so a failure names the input.
        self.assertNotEqual(parsed, None, sample)
        if parsed is None:
            continue
        self.assertEqual(parsed.date(), datetime.date(2009, 1, 1), sample)
def _set_datetime_value(self, new_value):
    """
    Store ``new_value`` in the datetime, numeric and string value fields.

    What is stored depends on ``self.attribute.type``:

    * ``AttributeType.DATETIME``: the datetime itself, its Unix timestamp
      and its ISO 8601 string.
    * ``AttributeType.DATE``: the date at midnight, the date's ordinal and
      the date's ISO 8601 string.

    Any other attribute type leaves the instance untouched.

    :param new_value: A ``datetime.datetime`` for DATETIME attributes;
                      for DATE attributes, anything ``parse_date`` accepts.
    :raises TypeError: If a DATETIME attribute is given a non-datetime value.
    """
    attribute_type = self.attribute.type

    if attribute_type == AttributeType.DATETIME:
        # Just store datetimes
        if not isinstance(new_value, datetime.datetime):
            # Wrapped in a tuple so a tuple-valued ``new_value`` is shown with
            # %r instead of being unpacked as the format arguments.
            raise TypeError("Can't assign %r to DATETIME attribute" % (new_value,))
        self.datetime_value = new_value
        # timegm() expects a UTC time tuple. utctimetuple() converts aware
        # datetimes to UTC and equals timetuple() for naive ones, so naive
        # values keep the timestamp they had before.
        self.numeric_value = calendar.timegm(self.datetime_value.utctimetuple())
        self.untranslated_string_value = self.datetime_value.isoformat()
    elif attribute_type == AttributeType.DATE:
        # Store dates as "date at midnight"
        date = parse_date(new_value)
        self.datetime_value = datetime.datetime.combine(date=date, time=datetime.time())
        self.numeric_value = date.toordinal()  # Store date ordinal as numeric value
        self.untranslated_string_value = date.isoformat()  # Store date ISO format as string value
def parse(self, response):
    """
    Scrape one profile page into a ``FacebookUser`` plus related items.

    Yields, in order:

    * a ``FacebookCity`` for the current city ('Ciudad actual'), if found;
    * a ``FacebookCity`` for the hometown ('Ciudad de origen'), if found;
    * the ``FacebookUser`` itself when the remaining depth level is 0, or
      otherwise a ``Request`` for the friends page, with the user and the
      decremented level in ``request.meta``.

    Nothing is yielded when the page has no ``<title>`` text.

    :param response: Response for the profile page; ``response.meta`` may
                     carry the remaining exploration ``level``.
    :raises Exception: If the exploration depth level cannot be determined.
    """
    user = FacebookUser()

    # Get the ID
    user['id'] = FacebookSpider.id_from_url(response.url).decode('utf-8')

    # Get the name
    try:
        user['name'] = response.xpath('//title/text()').extract()[0]
    except IndexError:
        self.log("Impossible to determine name of user. Skipping", log.CRITICAL)
        return
    root = response.xpath(
        "(//div[@id='root' and descendant::div[@id='contact-info']])[1]"
    )

    # NOTE(review): the XPaths below that start with '//' are evaluated from
    # the document root, not from the selector they are called on. They are
    # left unchanged here - confirm whether './/' was intended.

    # Get the picture
    user['image_urls'] = []
    # NOTE(review): pictures are only collected for names containing
    # 'anabel'; this looks like a debugging leftover - confirm before
    # removing.
    if 'anabel' in user['name'].lower():
        picture_urls = root.xpath(
            "//img[parent::a[contains(@href, 'photo.php')]]/@src"
        ).extract()
        if picture_urls:
            user['image_urls'].append(picture_urls[0])

    living_info = root.xpath("div/div[@id='living']")

    # Get location
    location = self._extract_city(living_info, 'Ciudad actual')
    if location is None:
        user['location_id'] = None
    else:
        user['location_id'] = location['id']
        yield location

    # Get birthplace
    birthplace = self._extract_city(living_info, 'Ciudad de origen')
    if birthplace is None:
        user['birthplace_id'] = None
    else:
        user['birthplace_id'] = birthplace['id']
        yield birthplace

    basic_info = root.xpath("div/div[@id='basic-info']")

    # Get sex
    sex_texts = basic_info.xpath(
        "//tr[ancestor::div[@title='Sexo']]/td[2]/div/text()"
    ).extract()
    sex_label = sex_texts[0] if sex_texts else None
    user['sex'] = {'Hombre': u'H', 'Mujer': u'M'}.get(sex_label, u'?')

    # Get birthdate
    try:
        birthdate = basic_info.xpath(
            "//tr[ancestor::div[@title='Fecha de nacimiento']]/td[2]/div/text()"
        ).extract()[0]
        user['birthdate'] = parse_date(birthdate)
    except (UnrecognizedDateFormatError, IndexError):
        user['birthdate'] = None

    # Get friends URL
    self.friends_url = FacebookSpider.friends_url(user['id'])
    user['friends'] = []

    # Set exploration depth level: the level from the request wins over the
    # spider-wide maximum.
    try:
        if 'level' in response.meta:
            level = response.meta['level']
        else:
            level = int(FacebookSpider.max_depth)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        raise Exception("Could not determine the desired exploration depth level")

    # Check if we continue going down the tree or not
    if level == 0:
        yield user
    else:
        request = Request(url=self.friends_url, callback=self.parse_friends)
        request.meta['user'] = user
        request.meta['level'] = level - 1
        yield request


def _extract_city(self, living_info, section_title):
    """
    Build a ``FacebookCity`` from the link under the div titled ``section_title``.

    :param living_info: Selector for the 'living' section of the profile.
    :param section_title: ``title`` attribute of the div holding the city
                          link ('Ciudad actual' or 'Ciudad de origen').
    :return: The populated ``FacebookCity``, or ``None`` when the link, its
             ``id`` query parameter or its text is missing.
    """
    city_link = living_info.xpath(
        "//a[ancestor::div[@title='{0}'] and contains(@href, '{1}')]"
        .format(section_title, FacebookSpider.path_profile)
    )
    try:
        city_url = urlparse(city_link.xpath("@href").extract()[0])
        city = FacebookCity()
        # Indexing raises KeyError when the URL has no 'id' parameter; the
        # previous ``.get('id')[0]`` raised an uncaught TypeError instead.
        city['id'] = '/' + parse_qs(city_url.query)['id'][0]
        city['name'] = city_link.xpath("text()").extract()[0]
    except (IndexError, KeyError):
        return None
    return city
def test_times(self):
    """Check that each sample string parses to 2009-01-01 13:17."""
    samples = ('1/1/09 13:17', '1/1/09 1:17 pm')
    for sample in samples:
        parsed = parse_date(sample)
        # The sample is passed as the message so a failure names the input.
        self.assertNotEqual(parsed, None, sample)
        if parsed is None:
            continue
        self.assertEqual(parsed, datetime.datetime(2009, 1, 1, 13, 17))