def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.

    Corpus information is read from the 'root/corpus_info.json' file.

    @type root: string
    @param root: The directory path to the corpus directory.
    @param kwargs: Optional reader settings. 'doc_part' selects which
        text of each article is read ('body' by default) and 'fileids'
        overrides the file ids derived from the corpus info. Both are
        consumed here; every remaining keyword is forwarded to
        CategorizedPlaintextCorpusReader.
    """
    self._root = root

    # The context manager closes the file even when json.load() raises.
    with open('%s/corpus_info.json' % root, 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles in general.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # fileids is handed to the parent positionally below, so it has to be
    # removed from kwargs; leaving it there would supply it a second time
    # as a keyword argument.
    if 'fileids' in kwargs:
        fileids = kwargs.pop('fileids')
    else:
        fileids = [doi2fn(d, doc_part) for d in info['d2c'].keys()]

    # cat_map f -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract')
    kwargs['cat_map'] = dict(
        (doi2fn(d, doc_part), cat) for d, cat in info['d2c'].items()
    )

    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
def _write_doc(self, base_dir, doc, doi):
    """
    Write the body and abstract text of one document to base_dir.

    The file names come from doi2fn(doi, 'body') and
    doi2fn(doi, 'abstract'). The body file receives doc['body'] and the
    abstract file receives the first element of doc['abstract']; both are
    written as UTF-8.
    """
    path_template = '{d}/{f}'
    body_path = path_template.format(d=base_dir, f=doi2fn(doi, 'body'))
    abstract_path = path_template.format(d=base_dir,
                                         f=doi2fn(doi, 'abstract'))

    # Body first, then abstract, each through its own handle.
    with codecs.open(body_path, 'w', encoding='utf-8') as body_out:
        body_out.write(doc['body'])

    with codecs.open(abstract_path, 'w', encoding='utf-8') as abstract_out:
        abstract_out.write(doc['abstract'][0])
def _article_info(self, doc, doi):
    """
    Build the info record stored for a single article.

    The record starts from field_list_to_dict(doc, fields) for the
    metadata fields listed below, then gains the article page URL, the
    article XML URL and the file ids of the body and abstract texts.
    """
    wanted = [
        'title',
        'author',
        'editor',
        'publication_date',
        'article_type',
        'journal',
        'id',
    ]
    record = field_list_to_dict(doc, wanted)

    # Links derived from the DOI.
    record['page_url'] = article_page_url(doi, pretty=True)
    record['xml_url'] = article_xml_url(doi, pretty=True)

    # File ids of the two text parts kept for the article.
    record['body_fid'] = doi2fn(doi, 'body')
    record['abstract_fid'] = doi2fn(doi, 'abstract')
    return record
def _article_info(self, doc, doi):
    """
    Collect the metadata kept for one article.

    Selected fields of doc are gathered with field_list_to_dict(); the
    page URL, XML URL, and the body/abstract file ids derived from doi
    are then added to the result, which is returned.
    """
    metadata_fields = ['title', 'author', 'editor', 'publication_date',
                       'article_type', 'journal', 'id']
    details = field_list_to_dict(doc, metadata_fields)

    page_url = article_page_url(doi, pretty=True)
    details['page_url'] = page_url

    xml_url = article_xml_url(doi, pretty=True)
    details['xml_url'] = xml_url

    body_fid = doi2fn(doi, 'body')
    details['body_fid'] = body_fid

    abstract_fid = doi2fn(doi, 'abstract')
    details['abstract_fid'] = abstract_fid

    return details
def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.

    Corpus information is read from the
    'root/<corpus_type>_corpus_info.json' file.

    @type root: string
    @param root: The directory path to the corpus.
    @param kwargs: Optional reader settings. 'corpus_type' selects the
        corpus info file ('full' by default), 'doc_part' selects which
        text of each article is read ('body' by default) and 'fileids'
        overrides the file ids derived from self.dois(). All three are
        consumed here; every remaining keyword is forwarded to
        CategorizedPlaintextCorpusReader.
    """
    self._root = root

    # corpus type is specific to Plos_builder
    #   full     - all documents that were built.
    #   partial  - documents excluding training
    #   training - documents intended for training
    self._corpus_type = kwargs.pop('corpus_type', 'full')

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    self._doc_part = doc_part = kwargs.pop('doc_part', 'body')

    # fileids is handed to the parent positionally below, so it has to be
    # removed from kwargs; leaving it there would supply it a second time
    # as a keyword argument.
    if 'fileids' in kwargs:
        fileids = kwargs.pop('fileids')
    else:
        fileids = [doi2fn(d, doc_part) for d in self.dois()]

    # cat_map f -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract')
    kwargs['cat_map'] = {
        doi2fn(d, doc_part): cat
        for d, cat in info['dois_to_categories'].items()
    }

    # Subclass of Categorized Plaintext Corpus Reader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
def doi2fid(self, doi_lst=None):
    """
    Pair DOIs with the file ids used for this reader's doc_part.

    @param doi_lst: The DOIs to look up. It is normalized by
        listafy(doi_lst, self._corpus_info['d2c']); presumably the
        corpus 'd2c' mapping supplies the DOIs when nothing is given --
        confirm against listafy.
    @return: zip() of the DOIs with the result of
        doi2fn(dois, self._doc_part).
    """
    known_dois = self._corpus_info['d2c']
    selected = listafy(doi_lst, known_dois)
    file_ids = doi2fn(selected, self._doc_part)
    return zip(selected, file_ids)
def add(self, docs):
    """
    Add documents to the corpus.

    For each doc, the body and the first abstract entry are written as
    UTF-8 text files in the corpus root (named by doi2fn), and the
    corpus info mappings ('d2c', 'c2d', 'd2info', 'article_link',
    'xml_link') are updated in memory.

    @type docs: list
    @param docs: A list containing the results of a PLoS search query.
        Each item is a dictionary with QUERY_RTN_FLDS as keys. 'id',
        'body' and 'abstract' are required; a missing 'subject' is
        filled in with [] on the doc itself.
    @return: Nothing
    """
    root = self._root
    info = self._corpus_info
    d2cmap = {}
    c2dmap = {}
    d2infomap = {}
    amap = info['article_link']
    xmap = info['xml_link']

    # Build all the lists and mappings
    for doc in docs:
        doi = doc['id']

        # If the doc has no subject, add [] (this mutates the doc).
        subjs = doc.setdefault('subject', [])

        # doi -> [ c1, c2, .... ]
        d2cmap[doi] = subjs

        # Category -> [ doi1, doi2, .... ]
        for s in subjs:
            c2dmap.setdefault(s, []).append(doi)

        # doi -> article link
        amap[doi] = articleUrl(doi)
        # doi -> article xml link
        xmap[doi] = articleXML(doi)

        # Depending on the article type some of these might not exist.
        d2infomap[doi] = (
            doc.get('journal', ''),
            doc.get('publication_date', ''),
            doc.get('article_type', ''),
            doc.get('title', ''),
            doc.get('author', []),
        )

    # Dump doc_part 'body' into individual files. The file name is taken
    # from the doc's own DOI: dict key order is unrelated to the order of
    # docs, so pairing d2cmap keys with docs could write a document's
    # text under another document's name.
    for doc in docs:
        path = '%s/%s' % (root, doi2fn(doc['id'], 'body'))
        with codecs.open(path, 'w', encoding='utf-8') as fd:
            fd.write(doc['body'])

    # Dump doc_part 'abstract' into individual files.
    for doc in docs:
        path = '%s/%s' % (root, doi2fn(doc['id'], 'abstract'))
        with codecs.open(path, 'w', encoding='utf-8') as fd:
            fd.write(doc['abstract'][0])

    # Update the corpus info
    info['d2c'].update(d2cmap)
    c2d = info['c2d']
    for category, dois in c2dmap.items():
        c2d.setdefault(category, []).extend(dois)
    info['d2info'].update(d2infomap)