Example #1
    def reduce_process(self):
        workers_folder = [os.path.join(self.main_folder, folder) for folder in os.listdir(self.main_folder)]

        sc.message("Processing files")
        print(workers_folder)

        reduced_folder = sc.check_folder(os.path.join(self.output_folder, "ner_encoded"))
        train_folder = sc.check_folder(os.path.join(reduced_folder, "train"))
        test_folder = sc.check_folder(os.path.join(reduced_folder, "test"))
        total_ads = 0

        # Count total lines across all worker datasets
        for dataset_folder in workers_folder:
            with open(os.path.join(dataset_folder, "dataset.jsonl"), "r", encoding="utf-8") as src:
                for _ in src:
                    total_ads += 1

        test_size = int(total_ads * self.test_perc)
        train_size = total_ads - test_size
        total_ads = 0

        # Write the first train_size lines to train/, the rest to test/
        for dataset_folder in workers_folder:
            with open(os.path.join(dataset_folder, "dataset.jsonl"), "r", encoding="utf-8") as src:
                for line in src:
                    save_folder = train_folder if total_ads < train_size else test_folder
                    with open(os.path.join(save_folder, "dataset.jsonl"), "a", encoding="utf-8") as js:
                        js.write(line)
                    total_ads += 1

        # Copy mapping files (everything except the dataset itself)
        maps_path = [os.path.join(workers_folder[0], file_name) for file_name in os.listdir(workers_folder[0]) if "dataset" not in file_name]
        for map_file in maps_path:
            copy(map_file, reduced_folder)

        sc.message("DONE! Save @{}".format(reduced_folder))
Example #2
def base(files: List[str], encoder: "pipelines.encoder.BaseEncoder" = None):
    """
    Base pipeline method to be used @pararell_processing
    :param files: file paths to be processed
    :param encoder: encoder class defined @config.gin
    """
    try:
        if not encoder:
            raise ValueError(
                "Encoder cannot be None. PLz Specificy a encoder @gin.config!")

        # Initialization
        generator = ds.build_advertise_generator(files)
        parser = Parser()
        enc_client = encoder()

        # Source generator, clean and encode advertise
        for line in generator:
            try:
                ad = dc.clean_raw_advertise(line, parser)
                if ad:
                    enc_client.encode_advertise(ad)
            except Exception as err:
                sc.message(err)

        enc_client.save_maps()

    except Exception as err:
        sc.message(err)
Example #3
    def save_maps(self, *maps):
        # NOTE: the *maps argument is currently unused; the instance's
        # self.maps dictionaries are what gets written out.
        sc.message("Saving Maps...")
        if self.model_folder:
            for k, mapping in self.maps.items():
                # TODO: may get too big, write as txt
                sc.save_dict_2json(os.path.join(self.model_folder, "{}.json".format(k)), mapping)
        else:
            # TODO: implement saving in DataStorage
            raise NotImplementedError("Saving outside a local folder path is not implemented yet!")
Example #4
    def cutoff_schema(self, schema_counter: Dict[str, int],
                      cutoff: int) -> List[str]:
        """Keep only the fields whose count exceeds the cutoff."""
        kept_fields = []
        for k, v in schema_counter.items():
            if v > cutoff:
                kept_fields.append(k)
            elif self.debug:
                sc.message("{} field dropped!".format(k))

        return kept_fields
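A small usage sketch of the cutoff logic above; the field names and counts are illustrative assumptions, not from the original schema:

# With a cutoff of 10, only fields counted more than 10 times survive
schema_counter = {"price": 120, "color": 45, "rare_typo_field": 3}
kept = [k for k, v in schema_counter.items() if v > 10]
assert kept == ["price", "color"]   # "rare_typo_field" is dropped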
Example #5
    def preload_maps(self, folder: Optional[str] = None):
        if folder:
            tmp_schema = sc.load_json(folder)
            sc.message("Schema loaded!")
            # Drop the bookkeeping entry before returning the schema
            if "NUMBER_OF_PROPERTIES" in tmp_schema:
                tmp_schema.pop("NUMBER_OF_PROPERTIES")
            return tmp_schema
        else:
            raise ValueError(
                "Parsed properties path cannot be None! Run the schema pipe to build parsed props!"
            )
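For illustration, this is what stripping the bookkeeping key looks like in isolation; the schema content below is an assumption:

# Hypothetical loaded schema with the bookkeeping entry present
tmp_schema = {"NUMBER_OF_PROPERTIES": 12, "price": {}, "color": {}}
tmp_schema.pop("NUMBER_OF_PROPERTIES", None)
assert set(tmp_schema) == {"price", "color"}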
Example #6
    def get_field_from_ad(self,
                          advertise: Dict[str, Any],
                          field_name: str,
                          alias: List[str],
                          default_value: Optional[str] = None):
        # Return the value of the first key contained in any alias entry
        for key in advertise.keys():
            if any(key in field for field in alias):
                return advertise[key]
        if self.debug:
            sc.message("No {0} column found @ advertise {1}".format(
                field_name, advertise))
        return default_value
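The alias match is a substring test: a key matches when it is contained in any alias entry. A self-contained sketch with made-up keys:

# "sale_price" matches because it is a substring of "sale_price_brl"
advertise = {"sale_price": "199.90", "title": "Phone"}
alias = ["sale_price_brl", "price"]
match = next((advertise[k] for k in advertise if any(k in a for a in alias)),
             None)
assert match == "199.90"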
Example #7
    def load_model(self, model_path: str):
        model_json = os.path.join(model_path, "model_arc.json")
        model_weights = [
            os.path.join(model_path, x) for x in os.listdir(model_path)
            if 'weights' in x
        ]
        if not model_weights:
            raise FileNotFoundError(
                "No weights file found in {}".format(model_path))
        with open(model_json, 'r') as dt:
            model_arc = json.load(dt)
        model = model_from_json(json.dumps(model_arc))
        model.load_weights(model_weights[0])
        sc.message("Category model loaded!")
        return model
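For context, a hypothetical save-side counterpart using the standard Keras API; the original repo's save code is not shown, so this is an assumption about how the artifacts are produced:

import os

def save_model_artifacts(model, model_path: str) -> None:
    # Hypothetical helper, not from the original repo: writes the
    # architecture JSON and a weights file whose name contains
    # "weights", so load_model above can find both.
    with open(os.path.join(model_path, "model_arc.json"), "w") as fh:
        fh.write(model.to_json())
    model.save_weights(os.path.join(model_path, "model_weights.h5"))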
Example #8
    def check_measure_fields(self, values: Set[str], tol: float = 0.7) -> bool:
        """
        Check if field is a 'measure field' - i.e. 64 gb, 5500 mah
        :param values: set of values
        :param tol: tolerance of minimum percentage of fit
        :return: True if is a measure field
        """
        counter: int = 0
        if type(values) is list:
            if len(values) == 0:
                sc.message("WARNING: NER MAPPER HAS A FIELD WITH NO VALUES.")
                return False

            for value in values:
                tmp = value.split()
                if len(tmp) == 2 and tmp[0].isnumeric():
                    counter += 1
            return (counter / len(values)) > tol
        else:
            return False
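The heuristic counts values shaped like "<number> <unit>"; with the default tol of 0.7, three matches out of four values is enough. An illustrative check:

# 3 of 4 values look like "<number> <unit>" -> 0.75 > 0.7
values = {"64 gb", "128 gb", "5500 mah", "red"}
hits = sum(1 for v in values
           if len(v.split()) == 2 and v.split()[0].isnumeric())
assert hits / len(values) == 0.75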
Example #9
def build_advertise_generator(files: List[str]):
    queue = deque(files)
    total_files = len(queue)
    process_counter = 0
    while queue:
        try:
            file_name = queue.pop()

            if file_name.endswith("jsonl"):
                with open(file_name, "r", encoding="utf-8") as src:
                    for line in src:
                        yield json.loads(line)
            else:
                # Plain .json files hold a list of advertises
                for advertise in sc.load_json(file_name):
                    yield advertise

            process_counter += 1
            sc.message("{} PROCESSED!".format(file_name))
            sc.message("Worker {}% complete!".format(
                round(process_counter / total_files * 100, 2)))
        except Exception as err:
            sc.message(err)
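A minimal usage sketch, assuming the surrounding module (and its `sc` helper) is importable; the file content is fabricated for the demo:

import json, os, tempfile

# Hypothetical: stream a tiny .jsonl file back through the generator
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "sample.jsonl")
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(json.dumps({"id": 1, "price": "10"}) + "\n")
    for ad in build_advertise_generator([path]):
        print(ad["id"])   # -> 1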
Example #10
def valid_advertise(advertise: Dict[str, Any],
                    minimum_cols: Tuple[str, ...] = ('price', 'url'),
                    category: Optional[str] = None,
                    market: Optional[str] = None,
                    silent: bool = True) -> bool:
    """
    Checks that no crucial field is missing or invalid.
    :param advertise: (dict) scraped item
    :param minimum_cols: columns that must be present and non-empty
    :param category: if given, the advertise must match this category
    :param market: if given, the advertise must match this market
    :param silent: suppress error messages when True
    :return: (bool) valid or not
    """

    try:
        # Checks if data has all necessary columns
        for col in minimum_cols:
            if col not in advertise.keys() or not advertise[col]:
                raise Exception(
                    '{0}:{1} - Missing necessary column: {2}'.format(
                        advertise["id"], "Missing Value", str(col)))
            elif col == "price" and not validate_prices(
                    advertise[col], advertise["category"]):
                raise Exception(
                    '{0}:{1} - Invalid price for category: {2} of {3}'.format(
                        advertise["id"], "Invalid Price",
                        advertise["category"], advertise["price"]))

        # Category/market filters need to run only once, after the column checks
        if category is not None and advertise["category"] != category:
            raise Exception('{0}:{1} - Invalid category: {2}'.format(
                advertise["id"], "Invalid Category", advertise["category"]))
        if market is not None and advertise["market"] != market:
            raise Exception('{0}:{1} - Invalid market: {2}'.format(
                advertise["id"], "Invalid Market", advertise["market"]))

        return True
    except Exception as err:
        if not silent:
            sc.message(str(err))
        return False
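An illustrative call, assuming `validate_prices` accepts the price/category pair below (both the ad and the validator's behavior are assumptions):

ad = {"id": "123", "price": "199.90", "url": "http://example.com",
      "category": "phones", "market": "BR"}
valid_advertise(ad)                                   # True if the price validates
valid_advertise({"id": "124", "price": "", "url": "x"},
                silent=False)                         # False: empty 'price'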
Example #11
    def reduce_process(self):
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        sc.message("Processing files")
        print(workers_folder)

        reduced_folder = sc.check_folder(
            os.path.join(self.output_folder, "sequence"))

        # Concatenate every worker's enriched sequences into one dataset
        with open(os.path.join(reduced_folder, "dataset.jsonl"),
                  "a",
                  encoding="utf-8") as js:
            for worker_folder in workers_folder:
                with open(os.path.join(worker_folder,
                                       "sequence_enriched.jsonl"),
                          "r",
                          encoding="utf-8") as src:
                    for line in src:
                        js.write(line)

        sc.message("DONE!")
Example #12
def clean(files: List[str], encoder: "pipelines.encoder.BaseEncoder" = None):
    """
    Data pipeline method to be used @pararell_processing when data is already clean
    So, no parser is needed.
    :param files: file paths to be processed
    :param encoder: encoder class defined @config.gin
    """
    try:
        if not encoder:
            raise ValueError(
                "Encoder cannot be None. PLz Specificy a encoder @gin.config!")

        generator = ds.build_advertise_generator(files)
        enc_client = encoder()

        for ad in generator:
            try:
                enc_client.encode_advertise(ad)
            except Exception as err:
                sc.message(err)

        enc_client.save_maps()
    except Exception as err:
        sc.message(err)
Example #13
    def reduce_process(self):
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        if self.debug:
            print("Paths being aggregated...")
            print(workers_folder)

        char2idx = set()
        word2idx = set()
        tag2idx = set()

        sc.message("Processing files...")

        for worker_folder in workers_folder:
            tmp_chars = sc.load_json(os.path.join(worker_folder, "char2idx.json"))
            tmp_words = sc.load_json(os.path.join(worker_folder, "word2idx.json"))
            tmp_tags = sc.load_json(os.path.join(worker_folder, "tag2idx.json"))

            # Merge each worker's vocabulary into the global sets
            char2idx.update(tmp_chars.keys())
            word2idx.update(tmp_words.keys())
            tag2idx.update(tmp_tags.keys())

        reduced_folder = sc.check_folder(
            os.path.join(self.output_folder, "ner_mapping"))
        basec2i = {"__PAD__": 0, "UNK": 1}
        basew2i = {"__PAD__": 0, "UNK": 1}
        baset2i = {"__PAD__": 0}

        char2idx.remove("__PAD__")
        char2idx.remove("UNK")
        word2idx.remove("__PAD__")
        word2idx.remove("UNK")
        tag2idx.remove("__PAD__")
        if "UNK" in tag2idx:
            tag2idx.remove("UNK")

        # Offsets reserve 0/1 for "__PAD__"/"UNK" (tags reserve only 0);
        # sorting makes the resulting maps reproducible across runs
        for i, c in enumerate(sorted(char2idx)):
            basec2i[c] = i + 2

        for i, w in enumerate(sorted(word2idx)):
            basew2i[w] = i + 2

        for i, t in enumerate(sorted(tag2idx)):
            baset2i[t] = i + 1

        sc.message("Saving chars")
        sc.save_dict_2json(os.path.join(reduced_folder, "char2idx.json"),
                           basec2i)
        sc.message("Saving words")
        sc.save_dict_2json(os.path.join(reduced_folder, "word2idx.json"),
                           basew2i)
        sc.message("Saving tags")
        sc.save_dict_2json(os.path.join(reduced_folder, "tag2idx.json"),
                           baset2i)
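A small sketch of the offset scheme above: slots 0 and 1 are reserved for "__PAD__" and "UNK" in the char/word maps, so merged entries start at index 2 (tags start at 1). The characters are illustrative:

chars = {"a", "b"}
c2i = {"__PAD__": 0, "UNK": 1}
for i, c in enumerate(sorted(chars)):   # sorted for a deterministic demo
    c2i[c] = i + 2
assert c2i == {"__PAD__": 0, "UNK": 1, "a": 2, "b": 3}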