Esempio n. 1
0
def feature_extraction(csvfile,
                       datapath,
                       resultcsv,
                       fe_path=config.path_fe_space):
    fe = FeatureExtract(fe_path)
    fe.print_featuremap()

    # feature extraction
    out = open(resultcsv, "w")
    line = "%s,%s,%s" % ('id', 'label', fe.str_featuremap_line())
    # print line
    out.write(line + "\n")
    out.flush()

    with open(csvfile, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = True
        for row in reader:
            if not header:
                item = UrlItem()
                ct = open(datapath + "/" + row[0], "r").read()
                item['url'] = row[1]
                item['content'] = ct
                item['title'] = item.get_short_title()
                f = fe.extract_item(item)
                line = "%s,%s,%s" % (row[0], row[2],
                                     FeatureExtract.str_feature(f))
                print line
                out.write(line + "\n")
                out.flush()
            else:
                header = False
        out.close()
Esempio n. 2
0
def feature_extraction(csvfile, datapath, resultcsv,fe_path=config.path_fe_space):
    fe = FeatureExtract(fe_path)
    fe.print_featuremap()

    # feature extraction
    out = open(resultcsv, "w")
    line = "%s,%s,%s" % ('id', 'label', fe.str_featuremap_line())
    # print line
    out.write(line+"\n")
    out.flush()

    with open(csvfile, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = True
        for row in reader:
            if not header:
                item = UrlItem()
                ct = open(datapath + "/" + row[0], "r").read()
                item['url'] = row[1]
                item['content'] = ct
                item['title'] = item.get_short_title()
                f = fe.extract_item(item)
                line = "%s,%s,%s" % (row[0], row[2], FeatureExtract.str_feature(f))
                print line
                out.write(line+"\n")
                out.flush()
            else:
                header = False
        out.close()
Esempio n. 3
0
 def parse_item(response):
     """Wrap the downloaded *response* in a UrlItem and yield it.

     The response is assumed to be new or updated; the database is not
     modified here (presumably dedup against URL_LIB happens elsewhere —
     TODO confirm).
     """
     page = UrlItem.load_with_content(url=response.url, response=response)
     logging.info("PC get page [%s]:- %s" % (page['id'], page['url']))
     yield page
Esempio n. 4
0
    def __op_new(self, data_loaded, connection):
        """Judge a newly arrived item.

        Extracts the item's feature vector, then obtains a decision either
        from the classifier (__auto_judge) or, when the payload already
        carries a 'decision' (sent back from the Extractor), from that
        decision at 100% confidence (which is also fed back into the
        classifier).  Confident decisions are persisted and the item is
        forwarded to the extractor or its inbox file removed; uncertain
        items are parked in the judge queue for a human.

        data_loaded -- decoded request payload; needs 'id', optionally 'decision'.
        connection  -- client connection (unused in this handler).
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id,file_path=config.path_judge_inbox)
        feature = self.__fe.extract_item(item)

        if 'decision' not in data_loaded.keys():
            # no decision supplied: ask the classifier
            decision, confidence = self.__auto_judge(feature)
            log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
        else:
            # decision came back from the Extractor: trust it fully
            decision, confidence = data_loaded['decision'],100
            log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
            self.__relearn_clf(feature,decision)

        if confidence > config.const_CONFIDENCE_THRESHOLD:
            # confident: persist the decision
            item['is_target'] = decision
            item.save()
            if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE]:
                # item is a target page: pass it on for extraction
                self.__send_to_extractor(item)
            else:
                # not a target: its inbox file is no longer needed
                os.remove(config.path_judge_inbox + "/%s" % item.filename())
        else:
            # uncertain: mark unknown and queue for human review
            item['is_target'] = config.const_IS_TARGET_UNKNOW
            item.save()

            self.__judge_queue[item_id] = {
                "title": item['title'],
                "url": item['url'],
                "filename": item.filename(),
                "confidence": round(confidence,2),
                "decision": decision,
                "feature": feature
            }
        pass
Esempio n. 5
0
    def __op_new(self, data_loaded, connection):
        """Handle a newly arrived item in the extractor.

        Loads the item from the extractor inbox and looks up how other
        pages with the same layout hash were extracted.  If one extractor
        has more than config.extractor_same_layout_number votes it is
        applied immediately; otherwise the item is parked in the pending
        queue with extractor UNKNOW so an operator can choose one.

        data_loaded -- decoded request payload; must contain 'id'.
        connection  -- client connection (unused in this handler).
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id,file_path=config.path_extractor_inbox)

        # pages sharing this page's layout hash, keyed by the extractor
        # that handled them (values look like vote counts)
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator

            # (extractor name, votes) pair with the most votes
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item,extractor)
                return

        # no reliable match: queue for manual extractor selection
        extractor = config.const_RULE_UNKNOW

        self.__ext_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor
        }

        log.info("[%s]: # %s " % (item_id, extractor))

        pass
Esempio n. 6
0
    def __op_new(self, data_loaded, connection):
        """Route a freshly arrived item to a known extractor, or park it
        in the pending queue for an operator to pick one."""
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_extractor_inbox)

        total, layout_map = db.get_url_with_same_layout_hash(
            item['layout_hash'])
        log.info(str(layout_map))
        log.info(total)

        if len(layout_map) > 0:
            import operator

            # extractor with the most votes among same-layout pages
            best = max(layout_map.iteritems(), key=operator.itemgetter(1))
            log.info(float(best[1]) / len(layout_map))
            if best[1] > config.extractor_same_layout_number:
                self.__extract(item, tool.str2extractor(best[0]))
                return

        # no confident match — queue it with an unknown extractor
        extractor = config.const_RULE_UNKNOW
        self.__ext_queue[item_id] = {
            "extractor": extractor,
            "decision": item['is_target'],
            "filename": item.filename(),
            "url": item['url'],
            "title": item['title']
        }
        log.info("[%s]: # %s " % (item_id, extractor))
Esempio n. 7
0
 def parse_item(response):
     """Build a UrlItem from the crawler *response* and yield it.

     The response is assumed new or updated; the database is not touched
     here (presumably URL_LIB membership is checked elsewhere — TODO
     confirm).
     """
     # response are all updated or new
     # db not changed
     # is this url in URL_LIB
     item = UrlItem.load_with_content(url=response.url, response=response)
     logging.info("PC get page [%s]:- %s" % (item['id'], item['url']))
     yield item
Esempio n. 8
0
 def __op_rejudge_done(self, data_loaded, connection):
     """Drop the item from the extractor queue, hand it back to the judge
     with the operator's decision, and ack the client with "0"."""
     the_id = int(data_loaded['id'])
     verdict = int(data_loaded['decision'])
     pending = UrlItem.load(id=the_id)
     del self.__ext_queue[the_id]
     self.__send_back_to_judge(pending, verdict)
     tool.send_msg(connection, "0")
Esempio n. 9
0
 def __op_rejudge_done(self, data_loaded, connection):
     """Remove the item from the extractor queue and send it back to the
     judge together with the operator's decision; reply "0" to ack."""
     item_id = int(data_loaded['id'])
     decision = int(data_loaded['decision'])
     item = UrlItem.load(id=item_id)
     del self.__ext_queue[item_id]
     self.__send_back_to_judge(item, decision)
     tool.send_msg(connection, "0")
     pass
Esempio n. 10
0
    def __op_add_extractor(self, data_loaded, connection):
        """Run the operator-chosen extractor on the queued item, drop it
        from the pending queue, and ack the client with "0"."""
        target = UrlItem.load_with_content(
            int(data_loaded['id']), file_path=config.path_extractor_inbox)
        self.__extract(target, data_loaded['extractor'])
        del self.__ext_queue[target['id']]
        tool.send_msg(connection, "0")
Esempio n. 11
0
 def __op_test_rule(self, data_loaded, connection):
     """Evaluate an ad-hoc extraction rule against one inbox item and
     send the extracted value back over *connection*."""
     page = UrlItem.load_with_content(
         id=int(data_loaded['id']),
         file_path=config.path_extractor_inbox)
     extracted = self.__ie.extract_attr(
         page,
         rule_id_or_dict=data_loaded['rule'],
         attr_id=int(data_loaded['attrid']))
     tool.send_msg(connection, extracted)
Esempio n. 12
0
 def __op_test_rule(self, data_loaded, connection):
     """Run one extraction rule ('rule', attribute id 'attrid') against
     the inbox item 'id' and reply with the extracted value."""
     item_id = int(data_loaded['id'])
     rule = data_loaded['rule']
     attrid = int(data_loaded['attrid'])
     item = UrlItem.load_with_content(
         id=item_id, file_path=config.path_extractor_inbox)
     tool.send_msg(
         connection,
         self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid)
     )
     pass
Esempio n. 13
0
    def __op_add_extractor(self, data_loaded, connection):
        """Apply the operator-chosen 'extractor' to the queued item 'id',
        remove it from the pending queue, and ack with "0"."""
        item_id = int(data_loaded['id'])
        extractor = data_loaded['extractor']

        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        self.__extract(item,extractor)
        del self.__ext_queue[item['id']]
        tool.send_msg(
            connection,
            "0"
        )
        pass
Esempio n. 14
0
 def __op_preview(self, data_loaded, connection):
     """Build a per-attribute preview for an item and send it back as a
     pickled list of {name, value} dicts.

     When no extractor is known (const_RULE_UNKNOW) every attribute
     previews as ""; otherwise the chosen extractor is run on the item
     loaded from the extractor inbox.
     """
     log.info(data_loaded['extractor'])
     if data_loaded['extractor'] == config.const_RULE_UNKNOW:
         # no extractor chosen yet: blank value for every attribute id
         result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
     else:
         item_id = int(data_loaded['id'])
         item = UrlItem.load_with_content(
             item_id, file_path=config.path_extractor_inbox)
         extractor = data_loaded['extractor']
         result = self.__ie.extract(item, extractor)
     preview = list()
     # loop variable renamed from 'str', which shadowed the builtin
     for att, value in result.iteritems():
         preview.insert(att, dict(name=self.__ie.name(att), value=value))
     log.info(preview)
     tool.send_msg(connection, pickle.dumps(preview, -1))
Esempio n. 15
0
 def __op_preview(self, data_loaded, connection):
     """Send a pickled preview — a list of {name, value} dicts, one per
     attribute — for the requested item.

     For const_RULE_UNKNOW each attribute previews as an empty string;
     otherwise the selected extractor is run on the inbox item.
     """
     log.info(data_loaded['extractor'])
     if data_loaded['extractor'] == config.const_RULE_UNKNOW:
         # no extractor selected: blank preview for every attribute id
         result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
     else:
         item_id = int(data_loaded['id'])
         item = UrlItem.load_with_content(
             item_id, file_path=config.path_extractor_inbox)
         extractor = data_loaded['extractor']
         result = self.__ie.extract(item, extractor)
     preview = list()
     # loop variable renamed from 'str', which shadowed the builtin
     for att, value in result.iteritems():
         preview.insert(att, dict(name=self.__ie.name(att), value=value))
     log.info(preview)
     tool.send_msg(connection, pickle.dumps(preview, -1))
Esempio n. 16
0
 def __refresh_list(self):
     """Re-judge every queued item with the (possibly retrained)
     classifier.

     Items that now pass the confidence threshold are saved and leave
     the queue (forwarded to the extractor, or their inbox file is
     deleted); the rest only get their queue entry's confidence and
     decision refreshed.
     """
     delete_ids = []
     for key, ent in self.__judge_queue.iteritems():
         decision, confidence = self.__auto_judge(ent['feature'])
         if confidence > config.const_CONFIDENCE_THRESHOLD:
             # confident now: persist and schedule removal from queue
             item = UrlItem.load(id=key)
             item['is_target'] = decision
             item.save()
             delete_ids.append(key)
             if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE]:
                 self.__send_to_extractor(item)
             else:
                 os.remove(config.path_judge_inbox + "/%s" % ent['filename'])
         else:
             # still uncertain: refresh the displayed numbers only
             self.__judge_queue[key]['confidence'] = confidence
             self.__judge_queue[key]['decision'] = decision
     # deletions deferred so the dict isn't resized while iterating
     for ent_id in delete_ids:
         del self.__judge_queue[ent_id]
Esempio n. 17
0
    def __op_done(self, data_loaded, connection):
        """Record a human decision for a queued item.

        Saves the decision, forwards target items to the extractor (or
        deletes the inbox file otherwise), feeds the (feature, decision)
        pair back into the classifier, removes the queue entry and acks
        the client with "0".
        """
        item_id = int(data_loaded['id'])
        decision = int(data_loaded['decision'])

        item = UrlItem.load(id=item_id)
        item['is_target'] = decision
        item.save()

        if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE]:
            # item is a target: pass it on for extraction
            self.__send_to_extractor(item)
        else:
            # not a target: its inbox file is no longer needed
            os.remove(config.path_judge_inbox + "/%s" % item.filename())

        # learn from the human decision
        self.__relearn_clf(self.__judge_queue[item_id]['feature'],decision)

        del self.__judge_queue[item_id]
        tool.send_msg(connection, "0")
        pass
Esempio n. 18
0
    def __op_new(self, data_loaded, connection):
        """Judge a newly arrived item.

        The item's feature vector is extracted, then a decision is taken
        either by the classifier (__auto_judge) or — when the payload
        carries a 'decision' sent back from the Extractor — from that
        decision at 100% confidence, which is also fed back into the
        classifier.  Confident decisions are persisted and routed;
        uncertain items go to the judge queue for a human.

        data_loaded -- decoded request payload; needs 'id', optionally 'decision'.
        connection  -- client connection (unused in this handler).
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_judge_inbox)
        feature = self.__fe.extract_item(item)

        if 'decision' not in data_loaded.keys():
            # no decision supplied: ask the classifier
            decision, confidence = self.__auto_judge(feature)
            log.info("[%s]: [%s] # %s # %s%%" %
                     (item_id, FeatureExtract.str_feature(feature), decision,
                      confidence))
        else:
            # decision returned by the Extractor: accept it outright
            decision, confidence = data_loaded['decision'], 100
            log.info("[%s]: back from Extractor # %s # %s%%" %
                     (item_id, decision, confidence))
            self.__relearn_clf(feature, decision)

        if confidence > config.const_CONFIDENCE_THRESHOLD:
            # pretty sure, save to db, and pass to extract
            item['is_target'] = decision
            item.save()
            if int(item['is_target']) in [
                    config.const_IS_TARGET_MULTIPLE,
                    config.const_IS_TARGET_SIGNLE
            ]:
                # item is target
                self.__send_to_extractor(item)
            else:
                # item is not target
                os.remove(config.path_judge_inbox + "/%s" % item.filename())
        else:
            # not sure, put it in queue, involving human-being
            item['is_target'] = config.const_IS_TARGET_UNKNOW
            item.save()

            self.__judge_queue[item_id] = {
                "title": item['title'],
                "url": item['url'],
                "filename": item.filename(),
                "confidence": round(confidence, 2),
                "decision": decision,
                "feature": feature
            }
        pass
Esempio n. 19
0
 def __op_refresh(self, data_loaded, connection):
     """Re-scan the extractor queue.

     Entries whose inbox file no longer exists are dropped (the sibling
     implementation guards for this; without the check
     load_with_content is called on a missing file).  For the rest,
     same-layout pages vote on an extractor; when one extractor has
     more than config.extractor_same_layout_number votes it is applied
     and the entry leaves the queue.
     """
     delete_ids = []
     for key, ent in self.__ext_queue.iteritems():
         item_id = int(key)
         # skip (and forget) queue entries whose inbox file is gone
         if not os.path.isfile(
                 config.path_extractor_inbox + "/" + str(item_id) + ".html"):
             delete_ids.append(item_id)
             continue
         item = UrlItem.load_with_content(
             id=item_id, file_path=config.path_extractor_inbox)
         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)
         if len(maps) > 0:
             import operator
             tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
             log.info(float(tar_ext[1]) / len(maps))
             if tar_ext[1] > config.extractor_same_layout_number:
                 extractor = tool.str2extractor(tar_ext[0])
                 self.__extract(item, extractor)
                 delete_ids.append(item_id)
     # clear delete_ids (deferred so the dict isn't mutated mid-iteration)
     for ent_id in delete_ids:
         del self.__ext_queue[ent_id]
Esempio n. 20
0
    def __op_done(self, data_loaded, connection):
        """Record a human judge decision for a queued item.

        Persists the decision, forwards target items to the extractor
        (otherwise deletes the inbox file), feeds the (feature, decision)
        pair back to the classifier, drops the queue entry and acks "0".
        """
        item_id = int(data_loaded['id'])
        decision = int(data_loaded['decision'])

        item = UrlItem.load(id=item_id)
        item['is_target'] = decision
        item.save()

        if int(item['is_target']) in [
                config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE
        ]:
            # item is target
            self.__send_to_extractor(item)
        else:
            # item is not target
            os.remove(config.path_judge_inbox + "/%s" % item.filename())

        # learn from the human decision
        self.__relearn_clf(self.__judge_queue[item_id]['feature'], decision)

        del self.__judge_queue[item_id]
        tool.send_msg(connection, "0")
        pass
Esempio n. 21
0
 def __op_refresh(self,data_loaded,connection):
     """Re-scan the extractor queue.

     Entries whose inbox file no longer exists are dropped.  For the
     rest, pages sharing the item's layout hash vote on an extractor;
     when one extractor has more than extractor_same_layout_number
     votes it is applied and the entry leaves the queue.
     """
     delete_ids = []
     for key, ent in self.__ext_queue.iteritems():
         item_id = int(key)
         # inbox file was removed: just forget the queue entry
         if not os.path.isfile(
                 config.path_extractor_inbox+"/"+str(item_id)+".html"):
             delete_ids.append(item_id)
             continue
         item = UrlItem.load_with_content(
             id=item_id,file_path=config.path_extractor_inbox)
         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)
         if len(maps) > 0:
             import operator
             # (extractor name, votes) pair with the most votes
             tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
             log.info(float(tar_ext[1]) / len(maps))
             if tar_ext[1] > config.extractor_same_layout_number:
                 extractor = tool.str2extractor(tar_ext[0])
                 self.__extract(item,extractor)
                 delete_ids.append(item_id)
     # deletions deferred so the dict isn't mutated while iterating
     for ent_id in delete_ids:
         del self.__ext_queue[ent_id]
Esempio n. 22
0
 def __refresh_list(self):
     """Re-judge all queued items with the current classifier.

     Items that now exceed the confidence threshold are saved and
     removed from the queue (forwarded to the extractor or their inbox
     file deleted); the rest only get their stored confidence and
     decision refreshed.
     """
     delete_ids = []
     for key, ent in self.__judge_queue.iteritems():
         decision, confidence = self.__auto_judge(ent['feature'])
         if confidence > config.const_CONFIDENCE_THRESHOLD:
             # pretty sure, save to db, and pass to extract
             item = UrlItem.load(id=key)
             item['is_target'] = decision
             item.save()
             delete_ids.append(key)
             if int(item['is_target']) in [
                     config.const_IS_TARGET_MULTIPLE,
                     config.const_IS_TARGET_SIGNLE
             ]:
                 self.__send_to_extractor(item)
             else:
                 os.remove(config.path_judge_inbox +
                           "/%s" % ent['filename'])
         else:
             # still uncertain: update the queue entry in place
             self.__judge_queue[key]['confidence'] = confidence
             self.__judge_queue[key]['decision'] = decision
     # clear delete_ids (deferred deletion keeps iteration safe)
     for ent_id in delete_ids:
         del self.__judge_queue[ent_id]
Esempio n. 23
0
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json


ie = InfoExtractor(config.path_extract_onto+"/seminar.xml",config.path_extract_onto)

item = UrlItem.load_with_content(id=1,file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')
rule={
    "on": "content",
    "scope": {
        "sel":"section#visible-body .logo",
        "target":"text"
    },
    "description": "url",
    "actions": [
        2
    ],
    "substring": {
        "after": "H",
        "before": ""
    },
}

print ie.extract_attr(item,rule_id_or_dict=rule)
Esempio n. 24
0
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# Manual smoke test for InfoExtractor: load the seminar ontology, pull
# item #1 from the judge inbox and run one hand-written rule on it.
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml",
                   config.path_extract_onto)

item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')
# Ad-hoc rule: selects the text of 'section#visible-body .logo' from the
# 'content' part; 'actions' and 'substring' are InfoExtractor rule fields
# (exact semantics defined by extract_attr — not visible here).
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}

print ie.extract_attr(item, rule_id_or_dict=rule)

# print json.dumps(ie.map(1), indent=2)
Esempio n. 25
0
 def parse(self, response):
     """Turn the downloaded *response* into a UrlItem and yield it."""
     page = UrlItem.load_with_content(url=response.url, response=response)
     logging.debug("Updater get page [%s]:- %s" % (page['id'], page['url']))
     yield page