Example #1
0
    def __init__(self, page, raw):
        """Populate the post's attributes from its raw data mapping.

        `page` is the owning page object; its `graph.partial` is called
        with the post id to build `self.graph`.  `raw` is the mapping of
        post fields.  `id`, `type`, `created_time` and `updated_time` are
        read with plain indexing (so they must be present); the remaining
        scalar fields are read with `raw.get` and are None when absent.
        """
        self.page = page
        self.raw = raw

        # required fields
        post_id = raw['id']
        self.graph = page.graph.partial(post_id)
        self.id = post_id
        self.type = raw['type']
        for stamp in ('created_time', 'updated_time'):
            setattr(self, stamp, utils.date.parse(raw[stamp]))

        # optional fields
        for field in ('name', 'story', 'link', 'message',
                      'description', 'shares'):
            setattr(self, field, raw.get(field))

        # TODO: figure out if *all* comments and likes are included
        # when getting post data, or just some
        self.comments = utils.api.getdata(raw, 'comments')
        self.likes = utils.api.getdata(raw, 'likes')

        # treat a missing message/description as empty text
        message_text = self.message or ''
        description_text = self.description or ''

        message_quotes = utils.extract_quotes(message_text)
        description_quotes = utils.extract_quotes(description_text)
        self.quotes = message_quotes + description_quotes

        # `self.link` is part of Facebook's post schema, whereas
        # `self.links` gathers the links found in the message and in
        # the description of any embedded media (plus `self.link`
        # itself when it is set)
        message_links = utils.extract_links(message_text)
        description_links = utils.extract_links(description_text)
        self.links = set(message_links + description_links)
        if self.link:
            self.links.add(self.link)

        self.picture = Picture(self, raw['picture']) if 'picture' in raw else None
Example #2
0
    def reindex_page(self, page, title, text=None):
        """Re-index the links and words stored for `title`.

        Updates the content of the database, so the caller needs to hold
        the relevant locks around this call.

        When `text` is not supplied it is obtained from `page._get_text()`
        (an empty string if the page has no such attribute).  If that
        raises `NotFoundErr`, the title row is deleted provided nothing
        links back to the title, and the page is indexed with no links
        and no words.
        """
        if text is None:
            fetch_text = getattr(page, '_get_text', lambda: u'')
            try:
                text = fetch_text()
            except NotFoundErr:
                # `text` is still None here: the assignment above never ran.
                title_id = self.title_id(title)
                backlinks = list(self.page_backlinks(title))
                if not backlinks:
                    self.db.query(Title).filter(Title.id == title_id).delete()

        links = extract_links(text) if text is not None else []

        self.update_links(title, links)
        self.update_words(title, text or u'')
Example #3
0
 def _crawl(self):
     """Get the HTML for `self.seed` and index the links found in it.

     Prints a progress message, passes `self.seed` to `get_html`, and
     stores the result of `extract_links` on that HTML in `self.index`.
     """
     # Parenthesized single-argument form: prints the same text under
     # Python 2's print statement and is valid as a Python 3 call.
     print("Getting info for %s" % self.seed)
     html = get_html(self.seed)
     self.index = extract_links(html)
Example #4
0
		content = None

		# load the content from the co-sponsor page
		while content == None:
			try:
				content = get_content(source_doc, requests_session)
			except requests.exceptions.ConnectionError as e:
				print e
				print '   Connection failed. Retrying...'
				requests_session = requests.session()
			except Exception as e:
				print e

		# loop over the links extracted from the co-sponsor page
		for link in extract_links(content, bill.co_sponsor_link):

			if 'District' in link['name']:

				# parse the name out of the link text
				parse_link_name = parse_name(link['name'])
				if parse_link_name['success']:

					# find the assembly member record
					# first, query with the parsed last name, the district, the chamber and the assembly
					member_query = (Assembly_Member
										.select()
										.join(Person)
										.where(
											  (Person.last_name == parse_link_name['name_dict']['last_name'])
											& (Assembly_Member.district == parse_link_name['name_dict']['district'])
        content = None

        # load the content from the co-sponsor page
        while content == None:
            try:
                content = get_content(source_doc, requests_session)
            except requests.exceptions.ConnectionError as e:
                print e
                print "   Connection failed. Retrying..."
                requests_session = requests.session()
            except Exception as e:
                print e

                # loop over the links extracted from the co-sponsor page
        for link in extract_links(content, bill.co_sponsor_link):

            if "District" in link["name"]:

                # parse the name out of the link text
                parse_link_name = parse_name(link["name"])
                if parse_link_name["success"]:

                    # find the assembly member record
                    # first, query with the parsed last name, the district, the chamber and the assembly
                    member_query = (
                        Assembly_Member.select()
                        .join(Person)
                        .where(
                            (Person.last_name == parse_link_name["name_dict"]["last_name"])
                            & (Assembly_Member.district == parse_link_name["name_dict"]["district"])