Example #1
 def process_item(self, item, spider):
     if not item.get('body'):
         raise DropItem(f'Missing body property in {item}')
     return item
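All of these snippets are Scrapy item pipelines: raising scrapy.exceptions.DropItem from process_item stops the item from reaching later pipelines. A minimal sketch of the full wiring around Example #1 (the class name, module path, and priority are illustrative assumptions, not from the original project):

from scrapy.exceptions import DropItem

class BodyValidationPipeline:
    def process_item(self, item, spider):
        # Drop any item whose 'body' field is missing or empty
        if not item.get('body'):
            raise DropItem(f'Missing body property in {item}')
        return item

# settings.py (hypothetical module path):
# ITEM_PIPELINES = {'myproject.pipelines.BodyValidationPipeline': 300}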
Example #2
 def item_completed(self, results, item, info):
     img_paths = [x['path'] for ok, x in results if ok]
     if not img_paths:
         raise DropItem("Images failed to download %s" % img_paths)
     return item
Example #3
 def item_completed(self, results, item, info):
     image_paths = [x['path'] for ok, x in results if ok]
     if not image_paths:
         raise DropItem('Image Download Failed')
     return item
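Examples #2, #3, and #6 are variants of the same pattern: an ImagesPipeline subclass whose item_completed hook drops items with no successfully downloaded images. A self-contained sketch of how such a subclass typically looks (field names follow the usual Scrapy convention; the class name is an assumption):

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class RequireImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Schedule one download per URL in the item
        for image_url in item.get('image_urls', []):
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        item['image_paths'] = image_paths
        return item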
Example #4
    def process_item(self, item, spider):
        valid = True
        # Fingerprint the item and run it through the dupe filter
        fingerprint = md5(item['status'].encode('utf-8')).hexdigest()
        duplicated = self.df.add(fingerprint)
        iserror = 'error' in item['url']
        if duplicated or iserror:
            valid = False
        else:
            for field in item:
                if not item[field]:
                    valid = False
                    raise DropItem("Missing {0}!".format(field))

        if valid:
            self.fa.write(fingerprint + '\r\n')
            self.collection.insert(dict(item))
            self.mongocounts += 1
            # MySQL save: pick which saved page to parse by collection name
            if settings['MONGODB_COLLECTION'] in [
                    "taoche", "youxin", "ttpai", "che168", "youxinpai",
                    "guazi", "renrenche", "kaixin", "haoche51", "souche",
                    "hx2car", "iautos", "souhu", "haoche99", "che273",
                    "che101", "chewang", "xcar", "ganji", "zg2sc", "ygche",
                    "che58", "youche", "cn2che", "baixing", "che273_test"
            ]:
                domtext = scrapy.selector.Selector(text=item["datasave"][1])
                parsed_item = car_parse.ILikeParse(self.caritemlist, item,
                                                   domtext)
                self.items.append(parsed_item)
                self.items = self.savedata(self.items, self.table,
                                           self.mysqlconnection, 1)
            elif settings['MONGODB_COLLECTION'] in [
                    "chemao", "aokangda", "auto51", 'aokangda_test',
                    'chezhibao'
            ]:
                domtext = scrapy.selector.Selector(text=item["datasave"][0])
                parsed_item = car_parse.ILikeParse(self.caritemlist, item,
                                                   domtext)
                self.items.append(parsed_item)
                self.items = self.savedata(self.items, self.table,
                                           self.mysqlconnection, 1)
        elif iserror:
            logging.log(msg="Car Error!", level=logging.INFO)
            # Log the failing URL separately
            urlog = {'url': item['url'], 'grabtime': item['grabtime']}
            self.collectionwrong.insert(urlog)
        # else: duplicate item, nothing to store

        # Log every processed URL
        urlog = {'url': item['url'], 'grabtime': item['grabtime']}
        self.collectionurllog.insert(urlog)
        return item
Example #5
 def __init__(self, original_url="", *args):
     self.original_url = original_url
     self.style = color.color_style()
     DropItem.__init__(self, *args)
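Examples #5, #18, and #32 subclass DropItem to carry extra context (the offending URL) along with the drop. A hedged sketch of how such a subclass is raised from a pipeline (the pipeline condition and arguments are illustrative):

from scrapy.exceptions import DropItem

class NofilesDrop(DropItem):
    def __init__(self, original_url='', *args):
        self.original_url = original_url
        super().__init__(*args)

# In a pipeline (hypothetical):
# if not item.get('files'):
#     raise NofilesDrop(item['url'], 'no files downloaded')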
Example #6
 def item_completed(self, results, item, info):
     image_paths = [x['path'] for ok, x in results if ok]
     if not image_paths:
         raise DropItem('Images were not downloaded %s' % image_paths)
     return item
Example #7
    def process_item(self, item, spider):
        if not scan(item):
            raise DropItem('item is incomplete')

        return item
Example #8
 def __str__(self):  # for usage: print(e)
     print(self.style.ERROR("DROP(CrawledUrlDrop):" + self.url))
     return DropItem.__str__(self)
Example #9
 def process_item(self, item, spider):
     titulo = item['titulo']
     if 'capsula' not in titulo:
         raise DropItem("Does not contain 'capsula'")
     else:
         return item
Example #10
 def process_item(self, item, spider):
     if item['zid'] in self.zud_seen:
         raise DropItem('Duplicate listing found %s' % item['zid'])
     elif re.search('AuthRequired', item['link']):
         raise DropItem('Unauthorized listing found %s' % item['zid'])
     return item
Example #11
 def process_item(self, item, spider):
     if self.filter is None or re.search(self.filter, item["url"]):
         return item
     else:
         raise DropItem("Pattern [%s] not in url [%s]" %
                        (self.filter, item["url"]))
Example #12
 def process_item(self, item, spider):
     exist_url = self.session.query(Urls).filter_by(url=item["url"]).first()
     if exist_url is not None:  # the current quote exists
         raise DropItem("Duplicate url found: %s" % item["url"])
     else:
         return item
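Example #12 assumes a SQLAlchemy session and a Urls model defined elsewhere; a plausible sketch of that plumbing (the column names and engine URL are assumptions, not from the original project):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Urls(Base):
    __tablename__ = 'urls'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)

engine = create_engine('sqlite:///quotes.db')  # hypothetical database
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()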
Example #13
    def __str__(self):  # for usage: print(e)
        print(self.style.ERROR("DROP(NofilesDrop):" + self.original_url))

        return DropItem.__str__(self)
Example #14
    def __str__(self):  # for usage: print(e)
        print(self.style.ERROR("DROP(NofilesDrop):" + self.original_url))

        return DropItem.__str__(self)
Example #15
 def _title(self, item: Any) -> None:
     if len(item['title']) < 3:
         raise DropItem("Drop item as title '{}' is bad".format(
             item['title']))
     item['title'] = item['title']
Example #16
    def __str__(self):  # for usage: print(e)
        print("DROP(NofilesDrop):" + self.original_url)

        return DropItem.__str__(self)
Example #17
 def _content(self, item: Any) -> None:
     if len(html2text(item['content'])) < 200:
         raise DropItem("Drop item as content is too short")
     item['content'] = item['content']
Example #18
 def __init__(self, info="", url="", *args):
     self.info = info
     self.url = url
     self.style = color.color_style()
     DropItem.__init__(self, *args)
Example #19
 def process_item(self, item, spider):
     if redis_db.hexists(redis_data_dict, item['id']):
         raise DropItem("Duplicate book found: %s" % item)
     return item
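Example #19 relies on module-level redis_db and redis_data_dict objects; a hedged sketch of what they might look like (host, port, and key name are assumptions):

import redis

redis_db = redis.StrictRedis(host='localhost', port=6379, db=0)
redis_data_dict = 'book_fingerprints'  # name of the Redis hash

# After an item passes the filter, the crawl would mark it as seen:
# redis_db.hset(redis_data_dict, item['id'], 1)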
Example #20
 def process_item(self, item, spider):
     match = re.match(r"item\?id=[0-9]+", item["url"])
     if match:
         print("Excluded self-post: " + str(item["url"]))
         raise DropItem("Excluded self-post: " + str(item["url"]))
     return item
Example #21
 def process_item(self, item, spider):
     if item['title']:
         item["title"] = clean_spaces(item["title"])
         return item
     else:
         raise DropItem("Missing title in %s" % item)
Example #22
 def process_item(self, item, spider):
     score_diff = int(item['score_diff'])
     if score_diff > 0 and score_diff < 250:
         return item
     else:
         raise DropItem("score_diff not matched!")
Example #23
 def drop_item_by_product_name(self, item):
     filter_types = [
         'boots',
         'tote',
         'crevasse',
         'watch',
         'watches',
         'glove',
         'mittens',
         'backpack',
         'kabyte',
         'kabig',
         'kaban',
         'itinerant',
         'gnomad',
         'toter',
         'duffel',
         'access',
         'pack',
         'bag',
         'scrunchie',
         'lanyard',
         'sunglasses',
         'sackpack',
         'shoe',
         'belt',
         'gaiter',
         'kneepad',
         'earphone',
         'skateboard',
         'slides',
         'cleats',
         'spikes',
         'wader',
         'lacrosse',
         'pads',
         'mule',
         'mitts',
         'mitt',
         'goggle',
         'booties',
         'bootie',
         'shawl',
         'blanket',
         'pouch',
         'torque',
         'mat',
         'fastpack',
         'puddle',
         'phone',
         'strap',
         'boot',
         'soccer',
         'benassi',
         'sack',
         'sandals',
         'sneaker',
         'ball',
         'cleat',
         'slide',
         'moc',
         'sandal',
         'waistbag',
         'canteen',
         'Hair Ties',
         'Water Bottle',
         'spray',
         'lotion',
         'co-wash',
         'shampoo',
         'sunscreen',
         'conditioner',
         'balm',
     ]
     for t in filter_types:
         if t in item['Name'].lower():
             raise DropItem('found type in Name: %s' % t)
         if t in item['Url'].lower():
             raise DropItem('found type in Url: %s' % t)
         # Catch only the missing/None-field cases, so the DropItem raised
         # below is not swallowed by the exception handler
         try:
             clothing = item['Clothing'].lower()
         except (KeyError, AttributeError):
             continue
         if t in clothing:
             raise DropItem('found type in Clothing: %s' % t)
Example #24
 def process_item(self, item, spider):
     if item['id'] in self.ids_seen:
         raise DropItem("Duplicate item found: %s" % item)
     else:
         self.ids_seen.add(item['id'])
         return item
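Example #24 is the canonical Scrapy duplicates filter; the full class from which such snippets are typically drawn initializes the ids_seen set in __init__. A sketch of that standard pattern:

from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    def __init__(self):
        # ids seen so far across the whole crawl
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem('Duplicate item found: %s' % item)
        self.ids_seen.add(item['id'])
        return item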
Example #25
 def item_completed(self, results, item, info):
     thumbnail_url = [x['path'] for ok, x in results if ok]
     if not thumbnail_url:
         raise DropItem("Item contains no images")
     item['thumbnail_url'] = thumbnail_url
     return item
Example #26
 def process_item(self, item, spider):
     if item['url'] in self.url_seen:
         raise DropItem(f'Item already visited {item}')
     else:
         self.url_seen.add(item['url'])
     return item
Example #27
    def process_item(self, item, spider):
        # START CLEANUP
        fields = [
            'title', 'geek_rating', 'min_age', 'votes', 'min_players',
            'max_players', 'weight', 'avg_rating'
        ]
        try:
            for field in fields:
                if field not in item or not item[field]:
                    item[field] = 0
                else:
                    item[field] = item[field].strip()
            if item['time'] == DOUBLE_EN_DASH or item['time'] is None:
                item['time'] = 0
            else:
                item['time'] = item['time'].strip()

            if 'mechanisms' in item:
                item['mechanisms'] = {k.strip() for k in item['mechanisms']}
            else:
                item['mechanisms'] = 0
        except AttributeError as err:
            raise DropItem(
                f'INFO: Dropping {item["title"]}, unhandled field.') from err
        # END CLEANUP

        # START PROCESSING
        if item['txt_cnt']:
            item['txt_cnt'] = int(re.findall(r'\d+', item['txt_cnt'])[0])
        else:
            item['txt_cnt'] = 0
        if item['vid_cnt']:
            item['vid_cnt'] = int(re.findall(r'\d+', item['vid_cnt'])[0])
        else:
            item['vid_cnt'] = 0
        item['review_count'] = item['txt_cnt'] + item['vid_cnt']
        if item['min_age'] == DOUBLE_EN_DASH:
            item['min_age'] = 0
        else:
            item['min_age'] = item['min_age'][:-1]
        for field in fields:
            if item[field] == 'N/A':
                item[field] = 0
        # END PROCESSING

        # START FILTERING
        if item['avg_rating'] != 0:
            int_fields = [
                'min_age', 'time', 'votes', 'min_players', 'max_players'
            ]
            float_fields = ['weight', 'avg_rating', 'geek_rating']
            for field in int_fields:
                item[field] = int(item[field])
            for field in float_fields:
                item[field] = float(item[field])

            # remove unwanted fields
            item.pop('txt_cnt', None)
            item.pop('vid_cnt', None)
            print(f'PROCESSED: {item["bg_id"]}, {item["title"]}')
            return item
        else:
            raise DropItem(
                f'BG: {item["title"]}, doesn\'t have enough info. "avg_rating: {item["avg_rating"]}"'
            )
Example #28
 def item_completed(self, results, item, info):
     image_paths = [x['path'] for ok, x in results if ok]
     if not image_paths:
         raise DropItem("Item contains no images")
     item['image_paths'] = image_paths
     return item
Example #29
 def process_item(self, item, spider):
     if item[self.key] not in self.records:
         return item
     else:
         raise DropItem('Duplicate %s: %s' % (self.key, item[self.key]))
Example #30
 def __init__(self, original_url="", *args):
     self.original_url = original_url
     self.style = color.color_style()
     DropItem.__init__(self, *args)
Example #31
    def process_item(self, item, spider):
        response = item['resp']
        item = vuln()

        xss_type = response.meta['type']
        orig_url = response.meta['orig_url']
        injections = response.meta['injections']
        quote_enclosure = response.meta['quote']
        inj_point = response.meta['inj_point']
        resp_url = response.url
        # response.body is bytes in Python 3; decode so the regexes below work on str
        body = response.body.decode('utf-8', errors='replace')
        # Regex: (.{0,50}?) lazily captures up to 50 chars between the two delimiters
        chars_between_delims = '%s(.{0,50}?)%s' % (self.test_str, self.test_str)
        inj_num = len(injections)
        mismatch = False
        if xss_type == 'form':
            POST_to = response.meta['POST_to']
        else:
            POST_to = None
        orig_payload = response.meta['payload'].strip(self.test_str)  # xss char payload
        escaped_payload = self.unescape_payload(orig_payload)

        break_tag_chars = set(['>', '<', '(', ')'])
        break_attr_chars = set([quote_enclosure, '(', ')'])
        break_js_chars = set(['"', "'", '(', ')'])

        matches = re.findall(chars_between_delims, body)
        if matches:
            xss_num = len(matches)

            if xss_num != inj_num:
                err = (
                    'Mismatch between harmless injection count and payloaded injection count: %d vs %d, increased chance of false positive'
                    % (inj_num, xss_num))
                item['error'] = err

            for idx, match in enumerate(matches):
                unfiltered_chars = self.get_unfiltered_chars(
                    match, escaped_payload)
                if unfiltered_chars:
                    try:
                        line, tag, attr, attr_val = spider.parse_injections(
                            injections[idx])
                    except IndexError:
                        mismatch = True
                        # Mismatch in num of test injections and num of payloads found
                        line, tag, attr, attr_val = 'Unknown', 'Unknown', None, None

                    joined_chars = ''.join(unfiltered_chars)
                    chars = set(joined_chars)
                    line_html = self.get_inj_line(body, match)

                    ###### XSS RULES ########

                    # If there's more XSS matches than harmless injections, we still want to check for the most dangerous characters
                    # May see some false positives here, but better than false negatives
                    if mismatch:
                        if '>' in escaped_payload and '<' in escaped_payload:
                            if '<' in joined_chars and '>' in joined_chars:
                                item = self.make_item(joined_chars, xss_type,
                                                      orig_payload, tag,
                                                      orig_url, inj_point,
                                                      line_html, POST_to, item)
                                item = self.url_item_filtering(item, spider)
                                return item

                    # Redirect
                    if joined_chars.lower() == 'javascript:prompt(99)':  # redirect
                        item = self.make_item(joined_chars, xss_type,
                                              orig_payload, tag, orig_url,
                                              inj_point, line_html, POST_to,
                                              item)
                        item = self.url_item_filtering(item, spider)
                        return item

                    # JS breakout
                    if self.js_pld == escaped_payload:  #js chars
                        if break_js_chars.issubset(chars):
                            item = self.make_item(joined_chars, xss_type,
                                                  orig_payload, tag, orig_url,
                                                  inj_point, line_html,
                                                  POST_to, item)
                            item = self.url_item_filtering(item, spider)
                            return item

                    # Attribute breakout
                    if attr:
                        if quote_enclosure in escaped_payload:
                            if break_attr_chars.issubset(chars):
                                item = self.make_item(joined_chars, xss_type,
                                                      orig_payload, tag,
                                                      orig_url, inj_point,
                                                      line_html, POST_to, item)
                                item = self.url_item_filtering(item, spider)
                                return item

                    # Tag breakout
                    else:
                        if '<' and '>' in escaped_payload:
                            if break_tag_chars.issubset(chars):
                                item = self.make_item(joined_chars, xss_type,
                                                      orig_payload, tag,
                                                      orig_url, inj_point,
                                                      line_html, POST_to, item)
                                item = self.url_item_filtering(item, spider)
                                return item

        # Check the entire body for an exact match; re.escape escapes every
        # regex metacharacter in the payload so it is searched for literally
        re_payload = '.{1}?' + re.escape(escaped_payload)
        full_matches = re.findall(re_payload, body)
        for f in full_matches:
            unescaped_match = ''.join(
                self.get_unfiltered_chars(f, escaped_payload))
            if unescaped_match == escaped_payload:
                item['error'] = ('Response passed injection point specific search without '
                                 'success, checked for exact payload match in body '
                                 '(higher chance of false positive here)')
                item['line'] = self.get_inj_line(body, f)
                item['xss_payload'] = orig_payload
                item['unfiltered'] = escaped_payload
                item['inj_point'] = inj_point
                item['xss_type'] = xss_type
                item['url'] = orig_url
                if POST_to:
                    item['POST_to'] = POST_to
                return item

        # In case it slips by all of the filters, then we move on
        raise DropItem(
            'No XSS vulns in %s. Tested: type = %s, injection point = %s' %
            (resp_url, xss_type, inj_point))
Example #32
 def __init__(self, original_url="", *args):
     self.original_url = original_url
     DropItem.__init__(self, *args)
Example #33
 def update(self, collection, item):
     try:
         collection.insert(dict(item))
         return item
     except Exception:
         raise DropItem('Item already exists.')
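A narrower variant of Example #33 that only treats duplicate-key failures as duplicates, assuming pymongo and a unique index on the deduplicating field (the index is an assumption):

from pymongo.errors import DuplicateKeyError
from scrapy.exceptions import DropItem

def update(self, collection, item):
    try:
        collection.insert_one(dict(item))  # insert_one is the modern pymongo API
        return item
    except DuplicateKeyError:
        raise DropItem('Item already exists.')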
Example #34
    def __str__(self):  # for usage: print(e)
        print(self.style.ERROR("DROP(NoTitleDrop):" + self.url))

        return DropItem.__str__(self)
Example #35
    def from_crawler(cls, crawler):

        if not crawler.settings.get('MYSQL_SETTINGS'):
            raise DropItem("缺少MySQL的配置")

        return cls(mysql_settings=settings.DATABASES.get('default'), )
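Example #35 raises DropItem from from_crawler, but DropItem is meant for process_item; Scrapy's usual convention for a missing setting at construction time is NotConfigured, which tells the crawler to skip the component. A hedged sketch of that convention:

from scrapy.exceptions import NotConfigured

@classmethod
def from_crawler(cls, crawler):
    mysql_settings = crawler.settings.get('MYSQL_SETTINGS')
    if not mysql_settings:
        raise NotConfigured('MYSQL_SETTINGS is required')
    return cls(mysql_settings=mysql_settings)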
Example #36
 def __str__(self):  # for usage: print(e)
     print(self.style.ERROR("DROP(NotContentPageDrop):" + self.info + '|' + self.url))
     return DropItem.__str__(self)
Example #37
    def process_item(self, item, spider):
        collection_name = self.stats.get_value("collection_name")
        where = {}
        if item.get("list_id"):
            item["list_id"] = ObjectId(item["list_id"])
        fileid = item.get("fileid")
        biz_id = item.get("biz_id")
        content_url = item.get("content_url")
        item = dict(item)
        item.update({"updateAt": datetime.utcnow()})
        # Handle both the article list and the article detail collections

        if collection_name in collections:
            if collection_name == "article.detail":
                where = {"$or": [{"content_url": content_url}]}
                if item.get("content"):
                    content = item.get("content")
                    item["content"] = content.replace("data-src", "src")
                # Rewrite content image URLs to the downloaded file paths
                img_paths = item.get("image_paths")
                if img_paths is not None and item.get("biz_id") is not None:
                    try:
                        img_replace_urls = Selector(text=item.get("content")).css("img::attr(src)").extract()
                    except Exception:
                        logger.error("No images found")
                    else:
                        for url in img_replace_urls:
                            temp_url = handle_img_urls(url)
                            sha1_url = img_uuid(temp_url)
                            path_url = item.get("biz_id") + "/" + sha1_url + ".jpg"
                            if path_url in img_paths:
                                path_url = BASE_URL + "upload/" + path_url
                                item["content"] = item["content"].replace(url, path_url)
            elif collection_name == "article.list":
                where = {
                    "$and":[
                        {
                            "fileid": fileid,         
                        },
                        {
                            "biz_id": biz_id
                        }
                    ]
                }
                if item.get("image_paths") and item.get("cover") and len(item.get("image_paths")) > 0:
                    item["cover"] =  BASE_URL + "upload/" + item.get("image_paths")[0]
                    
        else:
            raise DropItem("不存在的表")                    #不会进行处理
        
        self.db[collection_name].update_one(where, {"$set":item}, upsert=True)
        # old_data = self.db[collection_name].find_one(where, {"_id":1})
        # if not old_data:
        #     # self.db[collection_name].update_one(where, {"$set":item}, upsert=True)
        #     self.db[collection_name].insert(item)
        # else:
        #     if item.get("image_paths"):
        #         self.db.update_one(where, {"$set":item} )
        return item
Example #38
 def __str__(self):  # for usage: print(e)
     print(self.style.ERROR("DROP(KeywordNotFitDrop):" + self.info + '|' + self.url))
     return DropItem.__str__(self)
Example #39
    def process_item(self, item, spider):
        # UNIQUENESS CHECK: URL
        # =====================================================================
        if item['url'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['url'])

        # CLEAN THE LOCATION
        # =====================================================================

        # There are four cases: location w/ Reuters, no location w/ Reuters, Breakingviews w/ location, or location without Reuters

        if 'Breakingviews' not in item['location'][0] and '(Reuters)' in item['location'][0]:
            item['location'] = item['location'][0][:item['location'][0].find('(Reuters)')]

        elif 'Breakingviews' in item['location'][0]:
            item['location'] = item['location'][0][:item['location'][0].find('(Reuters Breakingviews)')]

        elif '(Reuters)' not in item['location'][0]:
            item['location'] = item['location'][0][:item['location'][0].find('-')]

        # CLEAN THE DATE
        # =====================================================================
        # I believe this is the posting timezone, but haven't verified
        local = pytz.timezone('America/New_York')

        date = item['published_date'][0][:item['published_date'][0].rfind('/')]
        date = str(date.replace('/', ''))
        date = parse(date)
        local_dt = local.localize(date, is_dst=None)
        utc_dt = local_dt.astimezone(pytz.utc)
        item['published_date'] = utc_dt

        # CLEAN THE PARAGRAPHS
        # =====================================================================
        item['paragraphs'][0] = item['paragraphs'][0][(item['paragraphs'][0].find('(Reuters) -') + 12):]
        replace_paragraph = []
        for value in item['paragraphs']:
            # Strip the stray '▒' block characters left by bad encoding
            value = value.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore').decode('utf-8')
            replace_paragraph.append(value)
        item['paragraphs'] = replace_paragraph
        # Not all Articles Have Authors on Reuters
        if len(item['author']) == 0:
            item['author'] = ['None']

        #  DATABASE INSERT
        # =====================================================================

        self.cur.execute(
            "INSERT INTO reuters_daily (author, paragraphs, published_date, subject, location, additional_authors, url, title) VALUES( %s, %s, %s, %s, %s, %s, %s, %s)",
            (item['author'], item['paragraphs'], item['published_date'],
             item['subject'], item['location'], item['additional_authors'],
             item['url'], item['title']))
        self.connection.commit()
        return item