Example #1
import re
from os.path import join

from nltk.corpus import reuters
from nltk.data import path as nltk_path

# ImporterBase, ImporterError, ClassInfo, DocumentInfo, ProgressIterator,
# nbprint, doc_progress_label, config, and data come from the surrounding project.

class ReutersImporter(ImporterBase):
    def count_docs_per_class(self):
        # Count only documents that belong to exactly one category
        counts = {}
        for file in reuters.fileids():
            categories = reuters.categories(file)
            if len(categories) == 1:
                classname = categories[0]
                counts[classname] = counts.get(classname, 0) + 1
        return counts

    def filter_classes(self, counts):
        # Keep only classes with at least min_docs_per_class documents
        newcounts = {}
        for key, val in counts.items():
            if val >= self.info['data_info']['min_docs_per_class']:
                newcounts[key] = val
        return newcounts

    def load_valid_classes(self):
        counts = self.count_docs_per_class()
        counts = self.filter_classes(counts)
        self.valid_classes = list(counts)

    def load_documents(self):
        for file in ProgressIterator(reuters.fileids(), doc_progress_label):
            categories = reuters.categories(file)
            if len(categories) > 1:
                continue
            classname = categories[0]
            if classname not in self.valid_classes:
                continue
            class_id = self.classinfo.increase_class_count(classname)

            # Rejoin the tokenized words and undo the most common
            # tokenization artifacts around numbers, punctuation, and quotes
            text = " ".join(reuters.words(file))
            text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
            text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
            text = re.sub(r" \.", ".", text)
            text = re.sub(r" \,", ",", text)
            text = re.sub(r" \)", ")", text)
            text = re.sub(r"\( ", "(", text)
            text = re.sub(r" \' ", "'", text)

            self.docinfo.add_document(text, class_id)

    def run(self):
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)

                # Load documents and store class information in classinfo
                self.load_documents()

            # Save meta information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        except (LookupError, FileNotFoundError):
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    join(config.paths["rawdata"], "nltk")))

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
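
The regex chain in load_documents reverses NLTK's word tokenization so the stored text reads naturally. Below is a minimal standalone sketch of that detokenization step; the words list stands in for the output of reuters.words(file), and the sample tokens are illustrative only.

import re

def detokenize(words):
    # Rejoin a token list, collapsing the spaces tokenization inserts
    # around punctuation, numbers, and quotes
    text = " ".join(words)
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)   # "3 . 14"  -> "3.14"
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)   # "1 , 250" -> "1,250"
    text = re.sub(r" \.", ".", text)
    text = re.sub(r" \,", ",", text)
    text = re.sub(r" \)", ")", text)
    text = re.sub(r"\( ", "(", text)
    text = re.sub(r" \' ", "'", text)
    return text

print(detokenize(["Profits", "rose", "1", ",", "250", "pct", "(", "est", ".", ")", "."]))
# -> Profits rose 1,250 pct (est.).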
Example #2
import json
import tarfile
import zipfile
from os.path import isfile, join

# ImporterBase, ClassInfo, DocumentInfo, ProgressIterator, nbprint,
# config, and data come from the surrounding project.

class TweetsODPImporter(ImporterBase):
    def parse_files(self, jsonfile):
        nbprint("Loading documents")
        for line in ProgressIterator(jsonfile):

            tweet = json.loads(line)
            text = tweet["full_text"]

            # The numeric "id" field loses precision when parsed as JSON,
            # so recover the exact id from "id_str"
            tweet_id = int(tweet["id_str"])
            classname = self.id_to_classname[tweet_id]

            # Skip the document once its class has reached max_docs_per_cls
            if (self.max_docs_per_cls is not None
                    and self.classinfo.classes.get(
                        classname, (0, 0))[1] >= self.max_docs_per_cls):
                continue
            class_id = self.classinfo.increase_class_count(classname)
            self.docinfo.add_document(text, class_id)

    def load_id_to_classname(self, folderpath, filename):
        nbprint("Extracting tsv")

        self.id_to_classname = {}
        max_depth = self.info['data_info']['maxdepth']
        tarfilename = join(folderpath, filename + ".tar.bz2")

        with tarfile.open(tarfilename, "r:bz2") as tar:
            tsvfile = tar.extractfile(filename + ".tsv")
            for line in ProgressIterator(tsvfile):
                fields = line.decode().split()
                tweet_id = int(fields[0])
                classname = fields[3]

                # Trim the trailing "*", drop the root of the category path,
                # and keep at most max_depth of the remaining levels
                classname = classname.strip("*")
                classhierarchy = classname.split("/")
                classhierarchy = classhierarchy[1:max_depth + 1]
                classname = "/".join(classhierarchy)

                self.id_to_classname[tweet_id] = classname

    def add_data(self, filename):
        nbprint("Loading '{}'".format(filename)).push()
        folderpath = join(config.paths["rawdata"], "tweetsodp")
        jsonfilename = join(folderpath, filename + ".json")
        zipfilename = join(folderpath, filename + ".json.zip")

        self.load_id_to_classname(folderpath, filename)
        if isfile(jsonfilename):
            with open(jsonfilename, "r") as jsonfile:
                self.parse_files(jsonfile)
        else:
            with zipfile.ZipFile(zipfilename) as zf:
                with zf.open(filename + ".json") as jsonfile:
                    self.parse_files(jsonfile)
        nbprint.pop()

    def run(self):
        self.classinfo = ClassInfo()
        self.max_docs_per_cls = self.info['data_info'].get(
            'max_docs_per_cls', None)
        with data.document_writer(self.info) as document_writer:
            self.docinfo = DocumentInfo(document_writer)
            self.add_data("ODPtweets-Mar17-29")
            self.add_data("ODPtweets-Apr12-24")

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)

        # Save meta info
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
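
The subtle part of load_id_to_classname is the maxdepth handling: each ODP category path is truncated to a fixed number of levels so deep subcategories collapse into their ancestors. Below is a small sketch of that truncation in isolation; the category string is a made-up example, not taken from the dataset.

def truncate_category(classname, max_depth):
    # Trim the trailing "*", drop the root of the path, and keep at
    # most max_depth of the remaining levels (mirrors the logic above)
    classname = classname.strip("*")
    hierarchy = classname.split("/")
    return "/".join(hierarchy[1:max_depth + 1])

print(truncate_category("Top/Sports/Soccer/Clubs*", 2))  # -> Sports/Soccer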