Beispiel #1
0
    def __init__(self,
                 a_subcorpus,
                 tag,
                 a_cursor=None,
                 extension="name",
                 indexing="word"):
        """Initialise the name bank, loading its documents from disk if no cursor is given.

        Args:
            a_subcorpus: subcorpus forwarded to ``abstract_bank.__init__``.
            tag: tag forwarded to ``abstract_bank.__init__``.
            a_cursor: when ``None`` (the default), every file returned by
                ``self.subcorpus.get_files(self.extension)`` is read as UTF-8
                and appended to this bank as a ``name_tagged_document``.
                For any other value nothing is loaded by this constructor.
            extension: bank extension forwarded to ``abstract_bank.__init__``.
            indexing: passed through unchanged to ``name_tagged_document``.
        """
        abstract_bank.__init__(self, a_subcorpus, tag, extension)

        # NOTE(review): self.subcorpus / self.extension are presumably set by
        # abstract_bank.__init__ -- confirm against the base class.
        if a_cursor is not None:
            # A cursor was supplied: this constructor reads nothing from disk.
            return

        sys.stderr.write("reading the name bank [%s]..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            # One dot per file as a progress indicator.
            sys.stderr.write(".")

            # The context manager closes the file even if read() raises.
            with codecs.open(a_file.physical_filename, "r", "utf-8") as name_file:
                name_tagged_document_string = name_file.read()

            name_tagged_document_id = "%s@%s" % (a_file.document_id,
                                                 self.subcorpus.id)

            self.append(
                name_tagged_document(name_tagged_document_string,
                                     name_tagged_document_id,
                                     self.extension,
                                     indexing=indexing))

        sys.stderr.write("\n")
Beispiel #2
0
    def __init__(self, a_subcorpus, tag, a_cursor=None, extension="parallel"):
        """Initialise the parallel bank, loading its documents from disk if no cursor is given.

        Args:
            a_subcorpus: subcorpus forwarded to ``abstract_bank.__init__``;
                its ``id`` is also passed to ``parallel_document.from_file``.
            tag: tag forwarded to ``abstract_bank.__init__``.
            a_cursor: when ``None`` (the default), every file returned by
                ``self.subcorpus.get_files(self.extension)`` is read as UTF-8.
                Files whose first line starts with ``"original document"``
                are skipped; the rest are appended as parallel documents.
                For any other value nothing is loaded by this constructor.
            extension: bank extension forwarded to ``abstract_bank.__init__``.

        Raises:
            IndexError: if one of the files is empty, because its first line
                is inspected unconditionally.
        """
        abstract_bank.__init__(self, a_subcorpus, tag, extension)

        self.matching_parallel_banks = []
        self.matching_treebanks = []
        self.matching_subcorpora = []

        # NOTE(review): self.subcorpus / self.extension are presumably set by
        # abstract_bank.__init__ -- confirm against the base class.
        if a_cursor is not None:
            # A cursor was supplied: this constructor reads nothing from disk.
            return

        sys.stderr.write("reading the parallel bank [%s] ..." %
                         self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            # One dot per file as a progress indicator.
            sys.stderr.write(".")

            with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                parallel_file_lines = f.readlines()

            # We want to map translated documents back to their originals.
            # There are two mapping files, which means there is redundant
            # information, so only the parallel files that represent
            # translations need to be processed; the "original document"
            # files are skipped.
            if parallel_file_lines[0].startswith("original document"):
                continue

            self.append(
                parallel_document.from_file(parallel_file_lines,
                                            a_file.document_id,
                                            a_subcorpus.id,
                                            self.extension))

        sys.stderr.write("\n")
Beispiel #3
0
    def __init__(self, a_subcorpus, tag, a_cursor=None, extension="parallel"):
        """Set up the parallel bank and, without a cursor, read its documents from files.

        Args:
            a_subcorpus: subcorpus passed on to ``abstract_bank.__init__``;
                its ``id`` is also handed to ``parallel_document.from_file``.
            tag: tag passed on to ``abstract_bank.__init__``.
            a_cursor: if ``None`` (the default), each file yielded by
                ``self.subcorpus.get_files(self.extension)`` is read as UTF-8;
                a file whose first line begins with ``"original document"``
                is skipped and every other file is appended as a parallel
                document.  With any other value no files are read here.
            extension: bank extension passed on to ``abstract_bank.__init__``.

        Raises:
            IndexError: when a file has no lines at all, since the first line
                is checked without guarding against an empty file.
        """
        abstract_bank.__init__(self, a_subcorpus, tag, extension)

        self.matching_parallel_banks = []
        self.matching_treebanks = []
        self.matching_subcorpora = []

        # NOTE(review): self.subcorpus / self.extension are presumably
        # populated by abstract_bank.__init__ -- verify in the base class.
        if a_cursor is None:
            sys.stderr.write("reading the parallel bank [%s] ..." % self.extension)

            for a_file in self.subcorpus.get_files(self.extension):
                # Progress indicator: one dot per file.
                sys.stderr.write(".")

                with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                    parallel_file_lines = f.readlines()

                # We want to map translated documents back to their
                # originals.  There are two mapping files, so the information
                # is redundant: only the files that represent translations
                # are processed and the "original document" ones are skipped.
                if parallel_file_lines[0].startswith("original document"):
                    continue

                a_parallel_document = parallel_document.from_file(
                    parallel_file_lines, a_file.document_id, a_subcorpus.id, self.extension)
                self.append(a_parallel_document)

            sys.stderr.write("\n")
        # With a cursor supplied there is nothing further to do here.
Beispiel #4
0
    def __init__(self, a_subcorpus, tag, a_cursor=None, extension="speaker"):
        """Initialise the speaker bank, loading its documents from disk if no cursor is given.

        Args:
            a_subcorpus: subcorpus forwarded to ``abstract_bank.__init__``;
                its ``id`` is used as the suffix of each document id.
            tag: tag forwarded to ``abstract_bank.__init__``.
            a_cursor: when ``None`` (the default), every file returned by
                ``self.subcorpus.get_files(self.extension)`` is read as UTF-8
                and appended via ``speaker_document.from_file``.  For any
                other value nothing is loaded by this constructor.
            extension: bank extension forwarded to ``abstract_bank.__init__``.
        """
        abstract_bank.__init__(self, a_subcorpus, tag, extension)

        # NOTE(review): self.subcorpus / self.extension are presumably set by
        # abstract_bank.__init__ -- confirm against the base class.
        if a_cursor is not None:
            # A cursor was supplied: this constructor reads nothing from disk.
            return

        sys.stderr.write("reading the speaker bank [%s] ..." % self.extension)

        for a_file in self.subcorpus.get_files(self.extension):
            # One dot per file as a progress indicator.
            sys.stderr.write(".")

            with codecs.open(a_file.physical_filename, "r", "utf-8") as f:
                speaker_file_lines = f.readlines()

            # Document ids take the form "<document_id>@<subcorpus id>".
            speaker_document_id = a_file.document_id + "@" + a_subcorpus.id

            self.append(speaker_document.from_file(speaker_file_lines,
                                                   speaker_document_id,
                                                   self.extension))

        sys.stderr.write("\n")
Beispiel #5
0
    def __init__(self, a_subcorpus, tag, a_cursor=None, extension="name", indexing="word"):
        """Set up the name bank and, without a cursor, read its documents from files.

        Args:
            a_subcorpus: subcorpus passed on to ``abstract_bank.__init__``.
            tag: tag passed on to ``abstract_bank.__init__``.
            a_cursor: if ``None`` (the default), each file yielded by
                ``self.subcorpus.get_files(self.extension)`` is read as UTF-8
                and appended to the bank as a ``name_tagged_document``.  With
                any other value no files are read here.
            extension: bank extension passed on to ``abstract_bank.__init__``.
            indexing: handed unchanged to ``name_tagged_document``.
        """
        abstract_bank.__init__(self, a_subcorpus, tag, extension)

        # NOTE(review): self.subcorpus / self.extension are presumably
        # populated by abstract_bank.__init__ -- verify in the base class.
        if a_cursor is None:
            sys.stderr.write("reading the name bank [%s]..." % self.extension)

            for a_file in self.subcorpus.get_files(self.extension):
                # Progress indicator: one dot per file.
                sys.stderr.write(".")

                # "with" guarantees the file is closed even if read() fails.
                with codecs.open(a_file.physical_filename, "r", "utf-8") as name_file:
                    name_tagged_document_string = name_file.read()

                # Document ids take the form "<document_id>@<subcorpus id>".
                name_tagged_document_id = "%s@%s" % (a_file.document_id, self.subcorpus.id)

                a_name_tagged_document = name_tagged_document(
                    name_tagged_document_string, name_tagged_document_id,
                    self.extension, indexing=indexing)
                self.append(a_name_tagged_document)

            sys.stderr.write("\n")
        # With a cursor supplied there is nothing further to do here.