def __init__(self, page, raw):
    self.page = page
    self.raw = raw

    # most fields aside from id, type, ctime
    # and mtime are optional
    self.graph = page.graph.partial(raw['id'])
    self.id = raw['id']
    self.type = raw['type']
    self.created_time = utils.date.parse(raw['created_time'])
    self.updated_time = utils.date.parse(raw['updated_time'])
    self.name = raw.get('name')
    self.story = raw.get('story')
    self.link = raw.get('link')
    self.message = raw.get('message')
    self.description = raw.get('description')
    self.shares = raw.get('shares')

    # TODO: figure out if *all* comments and likes are included
    # when getting post data, or just some
    self.comments = utils.api.getdata(raw, 'comments')
    self.likes = utils.api.getdata(raw, 'likes')

    self.quotes = (
        utils.extract_quotes(self.message or '') +
        utils.extract_quotes(self.description or '')
    )

    # `self.link` is part of Facebook's post schema;
    # `self.links` extracts links from the message and
    # the description of any embedded media
    self.links = set(
        utils.extract_links(self.message or '') +
        utils.extract_links(self.description or '')
    )
    if self.link:
        self.links.add(self.link)

    if 'picture' in raw:
        self.picture = Picture(self, raw['picture'])
    else:
        self.picture = None
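# The constructor above relies on `utils.extract_quotes` and
# `utils.extract_links`, which are not shown here. A minimal sketch of what
# they might look like, assuming simple regex-based extraction:

import re

_QUOTE_RE = re.compile(r'"([^"]+)"')
_LINK_RE = re.compile(r'https?://[^\s<>"]+')

def extract_quotes(text):
    # Return every double-quoted passage found in `text`.
    return _QUOTE_RE.findall(text)

def extract_links(text):
    # Return every http(s) URL found in `text`.
    return _LINK_RE.findall(text)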
def reindex_page(self, page, title, text=None):
    """Update the page's entries in the database; needs locks held around it."""
    if text is None:
        _get_text = getattr(page, '_get_text', lambda: u'')
        try:
            text = _get_text()
        except NotFoundErr:
            # the page is gone; drop its title row unless
            # other pages still link to it
            text = None
            title_id = self.title_id(title)
            if not list(self.page_backlinks(title)):
                self.db.query(Title).filter(Title.id == title_id).delete()
    if text is not None:
        links = extract_links(text)
    else:
        links = []
    self.update_links(title, links)
    self.update_words(title, text or u'')
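# `reindex_page` takes no locks itself; per its docstring, callers must.
# A minimal sketch of a caller, assuming a `threading.Lock` guards the index
# (the `index`, `page`, and `title` names here are placeholders):

import threading

index_lock = threading.Lock()

def reindex_with_lock(index, page, title):
    # Hold the lock across the whole read/update cycle so concurrent
    # reindex calls cannot interleave their link and word updates.
    with index_lock:
        index.reindex_page(page, title)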
def _crawl(self):
    # fetch the seed page and seed the index with its outgoing links
    print("Getting info for %s" % self.seed)
    html = get_html(self.seed)
    self.index = extract_links(html)
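# `get_html` is not defined in this snippet. A minimal sketch using the
# `requests` library (an assumption -- the original may well use urllib):

import requests

def get_html(url, timeout=10):
    # Fetch the page and return its body as text; raise on HTTP error
    # statuses so the crawler does not index error pages.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text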
content = None

# load the content from the co-sponsor page, retrying until the
# request succeeds
while content is None:
    try:
        content = get_content(source_doc, requests_session)
    except requests.exceptions.ConnectionError as e:
        print(e)
        print('  Connection failed. Retrying...')
        # start a fresh session before retrying
        requests_session = requests.session()
    except Exception as e:
        print(e)

# loop over the links extracted from the co-sponsor page
for link in extract_links(content, bill.co_sponsor_link):
    if 'District' in link['name']:
        # parse the name out of the link text
        parse_link_name = parse_name(link['name'])
        if parse_link_name['success']:
            # find the assembly member record:
            # first, query with the parsed last name, the district,
            # the chamber and the assembly
            member_query = (Assembly_Member
                            .select()
                            .join(Person)
                            .where(
                                (Person.last_name == parse_link_name['name_dict']['last_name']) &
                                (Assembly_Member.district == parse_link_name['name_dict']['district'])
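# The loop above expects `extract_links(content, base_url)` to return dicts
# with at least a 'name' key. That helper is not shown; a minimal sketch
# using BeautifulSoup (an assumption -- the 'href' key and the exact dict
# shape are guesses based on how `link['name']` is used above):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_links(content, base_url):
    # Collect each anchor's text and absolute URL from the page.
    soup = BeautifulSoup(content, 'html.parser')
    links = []
    for anchor in soup.find_all('a', href=True):
        links.append({
            'name': anchor.get_text(strip=True),
            'href': urljoin(base_url, anchor['href']),
        })
    return links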