Esempio n. 1
0
    def get_fields(cls, file, encoding):
        """Yield an ``ArticleField`` suggestion for every column with sample data.

        At most the first five rows of each file produced by
        ``UploadScript._get_files(file, encoding)`` are read. For every
        column the distinct, stripped, non-blank cell values are collected;
        columns that never contain such a value are not reported.

        Fields are yielded in sorted column-name order. Each carries up to
        five sample values plus the destination and type returned by
        ``guess_destination_and_type`` for the normalised column name and the
        first sample value.
        """
        sample_data = defaultdict(OrderedSet)

        # NOTE(review): the second item yielded by _get_files (presumably a
        # per-file encoding) is ignored; the caller-supplied `encoding` is
        # used for every file. Confirm this is intended.
        for f, _enc, _ in UploadScript._get_files(file, encoding):
            reader = csv.DictReader(_open(f, encoding))
            for row in itertools.islice(reader, 5):
                for field_name, value in row.items():
                    # DictReader pads short rows with None and stores the
                    # surplus cells of over-long rows as a list under the
                    # None key; neither is a usable sample value.
                    if not isinstance(value, str):
                        continue
                    stripped = value.strip()
                    if stripped:
                        sample_data[field_name].add(stripped)

        # Guess types and destinations
        for field_name, values in sorted(sample_data.items(),
                                         key=itemgetter(0)):
            filtered_field_name = to_valid_field_name(field_name)
            first_value = next(iter(values), None)

            suggested_destination, suggested_type = guess_destination_and_type(
                filtered_field_name, first_value)
            yield ArticleField(field_name,
                               destination=suggested_destination,
                               values=list(itertools.islice(values, 5)),
                               suggested_type=suggested_type)
Esempio n. 2
0
 def parse_file(self, file, encoding, _data):
     """Yield one ``Article`` per data row of the CSV file.

     Each row dict is passed through ``self.map_article``; every resulting
     value is then converted with ``parse_value(key, value)`` and the
     converted mapping is handed to ``Article.fromdict``. The ``_data``
     argument is not used.
     """
     for raw_row in csv.DictReader(_open(file, encoding)):
         mapped = self.map_article(raw_row)
         yield Article.fromdict(
             {key: parse_value(key, raw) for key, raw in mapped.items()})
Esempio n. 3
0
    def get_fields(cls, file: str, encoding: str):
        """Yield an ``ArticleField`` for every column of the uploaded CSV file(s).

        Every file produced by ``cls._get_files(file, encoding)`` is read as
        a semicolon-delimited CSV. Each header column is looked up in
        ``Language.reverseMap(cls.languages)`` to find its destination key,
        and all of that column's cell values are collected. When the same
        column occurs in several files, the values of the last file win.

        The destination key is translated through ``ESFIELDS`` before the
        field is yielded. A column missing from the reverse map, or a
        destination missing from ``ESFIELDS``, makes the lookup fail.
        """
        fields = OrderedDict()
        field_map = Language.reverseMap(cls.languages)
        for path, file_encoding, _ in cls._get_files(file, encoding):
            reader = csv.DictReader(_open(path, file_encoding), delimiter=";")
            rows = list(reader)
            # DictReader.fieldnames is None when the file has no header line
            # (e.g. an empty file); treat that as "no columns".
            for column in reader.fieldnames or ():
                fields[column] = (field_map[column],
                                  [row[column] for row in rows])

        for source, (destination, values) in fields.items():
            dest_name = ESFIELDS[destination]
            yield ArticleField(source, destination=dest_name, values=values)
Esempio n. 4
0
    def parse_file(self, file: str, encoding: str, _: None):
        """Yield the result of ``self._scrape_unit`` for each CSV row.

        The file is read as a semicolon-delimited CSV. Before any row is
        scraped, ``self.queries`` is reset to an empty set and ``self.lang``
        is set from ``self._get_language``, which receives the reader itself.
        The third argument is ignored.
        """
        self.queries = set()
        reader = csv.DictReader(_open(file, encoding), delimiter=";")
        self.lang = self._get_language(reader)

        for record in reader:
            yield self._scrape_unit(record)
Esempio n. 5
0
 def setUp(self):
     """Load the two DeFacto HTML fixtures stored next to this test module.

     Sets ``test_dir`` to the fixture directory, ``test1``/``test2`` to the
     fixture paths, and ``test1_html``/``test2_html`` to the result of
     ``get_html`` applied to each file opened with encoding "autodetect".
     """
     fixture_dir = os.path.join(os.path.dirname(__file__), 'test_files', 'defacto')
     self.test_dir = fixture_dir

     first_path = os.path.join(fixture_dir, 'DeFacto-Campus - Ausdruck1.htm')
     self.test1 = first_path
     self.test1_html = get_html(_open(first_path, "autodetect"))

     second_path = os.path.join(fixture_dir, 'DeFacto-Campus - Ausdruck2.htm')
     self.test2 = second_path
     self.test2_html = get_html(_open(second_path, "autodetect"))