def parse_item(self, response):
    # write downloaded body to temp file
    file = tempfile.NamedTemporaryFile(
        mode='w+b', delete=False, prefix="etl_web_crawl_")
    file.write(response.body)
    filename = file.name
    file.close()

    self.logger.info(
        'Adding ETL task for downloaded page or file from %s', response.url)

    downloaded_headers = {}
    if 'date' in response.headers:
        downloaded_headers['date'] = response.headers['date'].decode(
            "utf-8", errors="ignore")
    if 'last-modified' in response.headers:
        downloaded_headers['last-modified'] = response.headers[
            'last-modified'].decode("utf-8", errors="ignore")

    # add task to index the downloaded file/page by ETL web in Celery task worker
    index_web.apply_async(kwargs={
        'uri': response.url,
        'downloaded_file': filename,
        'downloaded_headers': downloaded_headers
    }, queue='tasks', priority=5)
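# --- hedged sketch, not part of the original listing ---
# parse_item() above reads like the link callback of a Scrapy CrawlSpider:
# index() below passes name, start_urls and rules as keyword arguments to
# process.crawl(), and Scrapy stores them as spider attributes. A minimal
# skeleton of such a spider could look like this; the actual attributes of
# OpenSemanticETL_Spider are an assumption here.
from scrapy.spiders import CrawlSpider

class OpenSemanticETL_Spider(CrawlSpider):
    # name, start_urls and rules are injected per crawl job via
    # process.crawl(OpenSemanticETL_Spider, start_urls=..., rules=..., name=...)
    name = 'open_semantic_etl'

    # parse_item(self, response) from above is defined as a method of this
    # class, so Rule(..., callback='parse_item') resolves to it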
def index(uri, crawler_type="PATH"):
    configfile = '/etc/opensemanticsearch/connector-web'

    # read config file
    config = {}
    exec(open(configfile).read(), locals())

    name = "Open Semantic ETL {}".format(uri)
    start_urls = [uri]

    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

    if crawler_type == "PATH":
        # crawl only the path
        filter_regex = re.escape(uri) + '*'
        rules = (Rule(LinkExtractor(
            allow=filter_regex,
            deny_extensions=config['webcrawler_deny_extensions']),
            callback='parse_item'), )
        process.crawl(OpenSemanticETL_Spider,
                      start_urls=start_urls, rules=rules, name=name)
    else:
        # crawl full domain and subdomains
        allowed_domain = uri

        # remove protocol prefix
        if allowed_domain.lower().startswith('http://www.'):
            allowed_domain = allowed_domain[11:]
        elif allowed_domain.lower().startswith('https://www.'):
            allowed_domain = allowed_domain[12:]
        elif allowed_domain.lower().startswith('http://'):
            allowed_domain = allowed_domain[7:]
        elif allowed_domain.lower().startswith('https://'):
            allowed_domain = allowed_domain[8:]

        # get only domain name without path
        allowed_domain = allowed_domain.split("/")[0]

        rules = (Rule(LinkExtractor(
            deny_extensions=config['webcrawler_deny_extensions']),
            callback='parse_item'), )
        process.crawl(OpenSemanticETL_Spider,
                      start_urls=start_urls,
                      allowed_domains=[allowed_domain],
                      rules=rules, name=name)

    # the start URL itself shall be indexed, too, so add task to index the
    # downloaded file/page by ETL web in Celery task worker
    index_web.apply_async(kwargs={'uri': uri},
                          queue='open_semantic_etl_tasks', priority=5)

    process.start()  # the script will block here until the crawling is finished
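# --- hedged usage sketch, not part of the original listing ---
# index() wires the crawl together: it builds the crawl rules from the
# connector config, queues the start URL itself for ETL and blocks until the
# crawl is finished. The URL below is only an illustration, and the call
# assumes the config file /etc/opensemanticsearch/connector-web exists and
# defines webcrawler_deny_extensions.
if __name__ == '__main__':
    # crawl only pages below the given path
    index('https://www.example.org/docs/', crawler_type='PATH')
    # any other crawler_type crawls the whole domain instead, e.g.
    # index('https://www.example.org/', crawler_type='DOMAIN')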
def index_tweet(obj, config):
    # read the tweet object's attributes as a dict
    tweet = obj.__dict__

    parameters = {}
    parameters['id'] = tweet['link']

    # map tweet attributes to index fields
    data = {}
    data['content_type_ss'] = 'Tweet'
    data['content_type_group_ss'] = 'Social media post'
    data['author_ss'] = tweet['name']
    data['userid_s'] = tweet['user_id_str']
    data['username_ss'] = tweet['username']
    data['title_txt'] = tweet['tweet']
    data['content_txt'] = tweet['tweet']
    data['hashtag_ss'] = tweet['hashtags']
    if tweet['place']:
        data['location_ss'] = tweet['place']
    data['urls_ss'] = tweet['urls']
    data['mentions_ss'] = tweet['mentions']
    data['retweets_count_i'] = tweet['retweets_count']
    data['likes_count_i'] = tweet['likes_count']
    data['replies_count_i'] = tweet['replies_count']
    data['file_modified_dt'] = tweet['datestamp'] + 'T' + tweet['timestamp'] + 'Z'

    # optionally queue every linked URL for indexing by the web connector
    if config.Index_Linked_Webpages:
        if data['urls_ss']:
            for url in data['urls_ss']:
                index_web.apply_async(
                    kwargs={'uri': url}, queue='tasks', priority=5)

    # hand the tweet over to the ETL pipeline
    try:
        etl.process(parameters, data)
    except BaseException as e:
        sys.stderr.write("Exception while indexing tweet {} : {}\n".format(
            parameters['id'], e))
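# --- hedged usage sketch, not part of the original listing ---
# index_tweet() expects an object whose __dict__ carries twint-style tweet
# attributes plus a connector config object; the mock objects below only
# illustrate the expected field names (they are not real data), and the call
# assumes the connector's module-level etl object is configured.
from types import SimpleNamespace

mock_tweet = SimpleNamespace(
    link='https://twitter.com/example_user/status/1',
    name='Example User', user_id_str='1', username='example_user',
    tweet='Example tweet text #demo', hashtags=['#demo'], place='',
    urls=[], mentions=[], retweets_count=0, likes_count=0, replies_count=0,
    datestamp='2020-01-01', timestamp='00:00:00')

mock_config = SimpleNamespace(Index_Linked_Webpages=False)

index_tweet(mock_tweet, mock_config)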