def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus. Corpus
    information is read from the 'root/corpus_info.json' file.

    @type root: string
    @param root: The directory path to the corpus directory.
    @keyword doc_part: Which part of each article to read; 'abstract'
        and 'body' are currently supported. Defaults to 'body'.
    @keyword fileids: Explicit fileids to read. When omitted, one
        fileid is derived (via doi2fn) for every DOI listed under
        'd2c' in the corpus info.
    @param kwargs: All remaining keyword arguments are forwarded to
        CategorizedPlaintextCorpusReader.__init__.
    """
    self._root = root

    # The context manager closes the file even if the JSON is malformed.
    with open('%s/corpus_info.json' % (root), 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles in general.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # fileids is handed to the parent positionally below, so it must be
    # removed from kwargs; leaving it there would pass it twice and
    # raise a TypeError.
    if 'fileids' in kwargs:
        fileids = kwargs.pop('fileids')
    else:
        fileids = [doi2fn(d, doc_part) for d in info['d2c']]

    # cat_map f -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    # .items() (rather than .iteritems()) works on Python 2 and 3.
    kwargs['cat_map'] = {
        doi2fn(d, doc_part): cat for d, cat in info['d2c'].items()
    }

    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
def __init__(self, input_folder_name, doc_pattern, categ_pattern, encoding='utf-8'):
    """
    Initialize the categorized reader over a folder and build a second,
    plain reader whose fileid pattern contains no slash.

    @type input_folder_name: string
    @param input_folder_name: Root directory handed to both readers.
    @type doc_pattern: string
    @param doc_pattern: Fileid pattern for the categorized reader. Its
        last three characters are reused as the tail of the pattern
        given to the plain reader.
    @type categ_pattern: string
    @param categ_pattern: Passed to the parent as cat_pattern.
    @type encoding: string
    @param encoding: Stored on the instance as self.encoding.
    """
    CategorizedPlaintextCorpusReader.__init__(
        self, input_folder_name, doc_pattern, cat_pattern=categ_pattern
    )
    self.input_folder_name = input_folder_name
    # NOTE(review): encoding is only recorded here; it is not passed to
    # either reader constructed in this method.
    self.encoding = encoding

    # Only the last three characters of doc_pattern are kept.
    pattern_tail = doc_pattern[-3:]
    # '[^\/]*' admits no slash. The following '.' is an unescaped regex
    # wildcard (any single character), not a literal dot.
    # NOTE(review): confirm that is intended before escaping it.
    slashless_pattern = r'[^\/]*.' + pattern_tail

    self.root_reader = PlaintextCorpusReader(
        input_folder_name, fileids=slashless_pattern
    )
    # Snapshot of the plain reader's fileids as a list.
    self.root_ids = list(self.root_reader.fileids())
def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus. Corpus
    information is read from the 'root/<corpus_type>_corpus_info.json'
    file.

    @type root: string
    @param root: The directory path to the corpus.
    @keyword corpus_type: Selects which corpus info file is loaded:
        'full' (all documents that were built), 'partial' (documents
        excluding training) or 'training' (documents intended for
        training). Defaults to 'full'.
    @keyword doc_part: Which part of each article to read; 'abstract'
        and 'body' are currently supported. Defaults to 'body'.
    @keyword fileids: Explicit fileids to read. When omitted, one
        fileid is derived (via doi2fn) for every DOI returned by
        self.dois().
    @param kwargs: All remaining keyword arguments are forwarded to
        CategorizedPlaintextCorpusReader.__init__.
    """
    self._root = root

    # corpus type is specific to Plos_builder
    # full - all documents that were built.
    # partial - documents excluding training
    # training - documents intended for training
    self._corpus_type = kwargs.pop('corpus_type', 'full')

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # fileids is handed to the parent positionally below, so it must be
    # removed from kwargs; leaving it there would pass it twice and
    # raise a TypeError.
    if 'fileids' in kwargs:
        fileids = kwargs.pop('fileids')
    else:
        # self.dois() is defined elsewhere in the class; it is called
        # only after self._corpus_info has been set above.
        fileids = [doi2fn(d, doc_part) for d in self.dois()]

    # cat_map f -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    # .items() (rather than .iteritems()) works on Python 2 and 3.
    kwargs['cat_map'] = {
        doi2fn(d, doc_part): cat
        for d, cat in info['dois_to_categories'].items()
    }

    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)