def parse_item(self, response):

        # write downloaded body to a temp file; delete=False so the file
        # outlives this callback and can be read by the Celery ETL worker
        with tempfile.NamedTemporaryFile(mode='w+b',
                                         delete=False,
                                         prefix="etl_web_crawl_") as tmpfile:
            tmpfile.write(response.body)
            filename = tmpfile.name

        self.logger.info('Adding ETL task for downloaded page or file from %s',
                         response.url)

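        # forward selected HTTP response headers (decoded from bytes) to the ETL task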
        downloaded_headers = {}
        if 'date' in response.headers:
            downloaded_headers['date'] = response.headers['date'].decode(
                "utf-8", errors="ignore")
        if 'last-modified' in response.headers:
            downloaded_headers['last-modified'] = response.headers[
                'last-modified'].decode("utf-8", errors="ignore")

        # add task to index the downloaded file/page by ETL web in Celery task worker
        index_web.apply_async(kwargs={
            'uri': response.url,
            'downloaded_file': filename,
            'downloaded_headers': downloaded_headers
        },
                              queue='tasks',
                              priority=5)
def index(uri, crawler_type="PATH"):

    configfile = '/etc/opensemanticsearch/connector-web'

    # read config file (it is executed as Python code and fills the config dict)
    config = {}
    with open(configfile) as f:
        exec(f.read(), locals())
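    # illustrative example of a setting the config file is expected to provide
    # (the key is read below; the value shown here is only an assumption):
    # config['webcrawler_deny_extensions'] = ['css', 'js', 'ico']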

    name = "Open Semantic ETL {}".format(uri)

    start_urls = [uri]

    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

    if crawler_type == "PATH":
        # crawl only URLs below the given path: anchor the escaped URI at the
        # start of the URL, since LinkExtractor matches with re.search
        filter_regex = '^' + re.escape(uri)
        rules = (Rule(LinkExtractor(
            allow=filter_regex,
            deny_extensions=config['webcrawler_deny_extensions']),
                      callback='parse_item'), )
        process.crawl(OpenSemanticETL_Spider,
                      start_urls=start_urls,
                      rules=rules,
                      name=name)

    else:
        # crawl full domain and subdomains

        allowed_domain = uri
        # remove the protocol prefix and a leading 'www.'
        if allowed_domain.lower().startswith('http://www.'):
            allowed_domain = allowed_domain[11:]
        elif allowed_domain.lower().startswith('https://www.'):
            allowed_domain = allowed_domain[12:]
        elif allowed_domain.lower().startswith('http://'):
            allowed_domain = allowed_domain[7:]
        elif allowed_domain.lower().startswith('https://'):
            allowed_domain = allowed_domain[8:]

        # get only domain name without path
        allowed_domain = allowed_domain.split("/")[0]

        rules = (Rule(LinkExtractor(
            deny_extensions=config['webcrawler_deny_extensions']),
                      callback='parse_item'), )
        process.crawl(OpenSemanticETL_Spider,
                      start_urls=start_urls,
                      allowed_domains=[allowed_domain],
                      rules=rules,
                      name=name)

    # the start URL itself shall be indexed too, so add a task to index the
    # downloaded file/page by ETL web in the Celery task worker
    index_web.apply_async(kwargs={'uri': uri},
                          queue='open_semantic_etl_tasks',
                          priority=5)

    process.start()  # the script will block here until the crawling is finished
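
# illustrative usage of index() (URIs and the 'DOMAIN' label are hypothetical):
# index('https://example.org/docs/', crawler_type='PATH')    # crawl only below this path
# index('https://example.org/', crawler_type='DOMAIN')       # any value other than 'PATH'
#                                                             # crawls the full domain and subdomains
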
def index_tweet(obj, config):
    # access the tweet object's attributes as a plain dict
    tweet = obj.__dict__

    parameters = {}
    parameters['id'] = tweet['link']

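    # field names use Solr-style dynamic field suffixes
    # (_s / _ss string, _txt text, _i integer, _dt date)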
    data = {}
    data['content_type_ss'] = 'Tweet'
    data['content_type_group_ss'] = 'Social media post'

    data['author_ss'] = tweet['name']
    data['userid_s'] = tweet['user_id_str']
    data['username_ss'] = tweet['username']

    data['title_txt'] = tweet['tweet']
    data['content_txt'] = tweet['tweet']

    data['hashtag_ss'] = tweet['hashtags']

    if tweet['place']:
        data['location_ss'] = tweet['place']

    data['urls_ss'] = tweet['urls']

    data['mentions_ss'] = tweet['mentions']

    data['retweets_count_i'] = tweet['retweets_count']
    data['likes_count_i'] = tweet['likes_count']
    data['replies_count_i'] = tweet['replies_count']
    # combine date and time into an ISO 8601 / Solr date string, e.g. '2020-01-01T12:00:00Z'
    data['file_modified_dt'] = tweet['datestamp'] + 'T' + tweet['timestamp'] + 'Z'

    # optionally queue ETL tasks for web pages linked from the tweet
    if config.Index_Linked_Webpages and data['urls_ss']:
        for url in data['urls_ss']:
            index_web.apply_async(kwargs={'uri': url},
                                  queue='tasks',
                                  priority=5)

    try:
        etl.process(parameters, data)
    except BaseException as e:
        sys.stderr.write("Exception while indexing tweet {}: {}\n".format(
            parameters['id'], e))
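
# Minimal illustrative sketch (all attribute values hypothetical) of the tweet
# object that index_tweet() reads via obj.__dict__; real tweet objects and the
# config module come from the Twitter connector itself:
#
# from types import SimpleNamespace
# example_tweet = SimpleNamespace(
#     link='https://twitter.com/example_user/status/1',
#     name='Example User', user_id_str='1', username='example_user',
#     tweet='Hello world', hashtags=['hello'], place='', urls=[],
#     mentions=[], retweets_count=0, likes_count=0, replies_count=0,
#     datestamp='2020-01-01', timestamp='12:00:00')
# index_tweet(example_tweet, config)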