Beispiel #1
0
    def __init__(self, root, **kwargs):
        """
        Initialize a PLoS reader with a specific corpus.

        Corpus information is loaded from the 'root/corpus_info.json' file.

        @type  root: string
        @param root: The directory path to the corpus directory.
        @keyword doc_part: Which part of each article to read. 'abstract'
            and 'body' are currently supported; defaults to 'body'.
        @keyword fileids: Explicit file ids to read. When omitted, one file
            id is derived (via doi2fn) for every key of the 'd2c' entry of
            the corpus info.
        @note: All remaining keyword arguments are forwarded unchanged to
            CategorizedPlaintextCorpusReader.__init__.
        """
        self._root = root
        # The context manager closes the file even if json.load() raises.
        with open('%s/corpus_info.json' % (root,), 'r') as fp:
            self._corpus_info = info = json.load(fp)

        # doc_part is specific to PLoS and research articles in general.
        # The corpus contains separate text for each part, but the reader
        # is initialized to read only one. It is popped because the parent
        # reader does not know this keyword.
        self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

        # 'fileids' is handed to the parent positionally below, so it has
        # to be removed from kwargs; leaving it in would pass the argument
        # twice and raise a TypeError.
        fileids = kwargs.pop('fileids', None)
        if fileids is None:
            fileids = [doi2fn(d, doc_part) for d in info['d2c']]

        # cat_map: file id -> [c1, c2, ...]
        # The file ids depend on what the doc_part is ('body', 'abstract').
        # items() is used instead of iteritems() so this runs on both
        # Python 2 and Python 3.
        cat_map = {}
        for d, cat in info['d2c'].items():
            cat_map[doi2fn(d, doc_part)] = cat
        kwargs['cat_map'] = cat_map

        # Subclass of Categorized Plaintext Corpus Reader
        CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
Beispiel #2
0
    def __init__(self,
                 input_folder_name,
                 doc_pattern,
                 categ_pattern,
                 encoding='utf-8'):
        """
        Set up a categorized reader over a folder plus a plain reader
        limited to files whose id contains no '/' before the extension.

        @param input_folder_name: Root directory of the corpus.
        @param doc_pattern: File id pattern for the categorized reader. Its
            last three characters are also reused as the file extension
            for the secondary reader stored in C{root_reader}.
        @param categ_pattern: Passed to the categorized reader as
            C{cat_pattern}.
        @param encoding: Encoding used to decode the corpus files. It is
            forwarded to both underlying readers and stored on the
            instance.
        """
        # Forward the encoding; otherwise the parent silently falls back to
        # its own default and the caller's choice is ignored when decoding.
        CategorizedPlaintextCorpusReader.__init__(self,
                                                  input_folder_name,
                                                  doc_pattern,
                                                  cat_pattern=categ_pattern,
                                                  encoding=encoding)
        self.input_folder_name = input_folder_name
        # NOTE(review): NLTK's CorpusReader appears to define an
        # encoding() method; this instance attribute shadows it. Kept
        # as-is because external code may read it -- confirm it is intended.
        self.encoding = encoding
        # NOTE(review): the '.' before the extension is an unescaped regex
        # wildcard, not a literal dot -- confirm whether r'\.' was meant.
        self.root_reader = PlaintextCorpusReader(input_folder_name,
                                                 fileids=r'[^\/]*.' +
                                                 doc_pattern[-3:],
                                                 encoding=encoding)

        self.root_ids = list(self.root_reader.fileids())
Beispiel #3
0
  def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.

    Corpus information is loaded from the
    'root/<corpus_type>_corpus_info.json' file.

    @type  root: string
    @param root: The directory path to the corpus.
    @keyword corpus_type: Selects which corpus info file is loaded;
        defaults to 'full'. Values described for Plos_builder:
        'full' (all documents that were built), 'partial' (documents
        excluding training) and 'training' (documents intended for
        training).
    @keyword doc_part: Which part of each article to read. 'abstract'
        and 'body' are currently supported; defaults to 'body'.
    @keyword fileids: Explicit file ids to read. When omitted, one file
        id is derived (via doi2fn) for every entry of self.dois().
    @note: All remaining keyword arguments are forwarded unchanged to
        CategorizedPlaintextCorpusReader.__init__.
    """
    self._root = root

    # corpus_type is specific to Plos_builder; it is popped because the
    # parent reader does not know this keyword.
    self._corpus_type = kwargs.pop('corpus_type', 'full')

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
      self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # The corpus contains separate text for each part, but the reader
    # is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # 'fileids' is handed to the parent positionally below, so it has to
    # be removed from kwargs; leaving it in would pass the argument twice
    # and raise a TypeError.
    fileids = kwargs.pop('fileids', None)
    if fileids is None:
      # Must run after the corpus info is loaded, in the same order as
      # before this change.
      fileids = [doi2fn(d, doc_part) for d in self.dois()]

    # cat_map: file id -> [c1, c2, ...]
    # The file ids depend on what the doc_part is ('body', 'abstract').
    # items() is used instead of iteritems() so this runs on both
    # Python 2.7 and Python 3.
    kwargs['cat_map'] = {
      doi2fn(d, doc_part): cat
      for d, cat in info['dois_to_categories'].items()
    }
    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)