def setUp(self) -> None:
    """Create the corpus reader, the k-fold loader and one test subset.

    Stores three attributes on the instance: ``corpus`` (a
    ScopusProcessedCorpusReader over "Corpus/Processed_corpus/"),
    ``loader`` (a CorpuKfoldLoader built with ``n_folds=12`` and
    ``shuffle=False``) and ``subset`` (the first item produced by
    ``loader.fileids(test=True)``).
    """
    reader = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    self.corpus = reader
    self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
        reader, n_folds=12, shuffle=False)
    # Only the first item yielded by the loader is kept.
    test_fileids = self.loader.fileids(test=True)
    self.subset = next(test_fileids)
def setUp(self) -> None:
    """Create the corpus reader, k-fold loader, test subset and model.

    Stores on the instance: ``corpus`` (a ScopusProcessedCorpusReader over
    "Corpus/Processed_corpus/"), ``loader`` (a CorpuKfoldLoader built with
    ``n_folds=12`` and ``shuffle=False``), ``subset`` (the first item
    produced by ``loader.fileids(test=True)``) and ``model`` (a Pipeline
    with the steps "norm", "vect" and "clusters").
    """
    reader = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    self.corpus = reader
    self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
        reader, n_folds=12, shuffle=False)
    # Only the first item yielded by the loader is kept.
    test_fileids = self.loader.fileids(test=True)
    self.subset = next(test_fileids)

    # Steps are instantiated in pipeline order: normalise, vectorise, cluster.
    pipeline_steps = [
        ("norm", Corpus_Vectorizer.TitleNormalizer()),
        ("vect", Corpus_Vectorizer.OneHotVectorizer()),
        ("clusters", Corpus_Cluster.HierarchicalClustering()),
    ]
    self.model = Pipeline(pipeline_steps)
def process_corpus():
    """Run the pickled-corpus pre-processor over the corpus directory.

    Opens "Corpus/Processed_corpus/" with a ScopusRawCorpusReader, wraps it
    in a PickledCorpusPreProcessor and calls its ``transform`` method. The
    result of ``transform`` is discarded.
    """
    # NOTE(review): a *raw* reader is pointed at the processed-corpus
    # directory here - confirm this is the intended input path.
    raw_reader = Elsevier_Corpus_Reader.ScopusRawCorpusReader(
        "Corpus/Processed_corpus/")
    Elsivier_Corpus_Pre_Processor.PickledCorpusPreProcessor(
        raw_reader).transform()
def __init__(self, path):
    """
    Set up the author network for a corpus.

    Parameters
    ----------
    path : string like
        path to the corpus; kept on ``self.path`` and handed to a
        ScopusRawCorpusReader, which is kept on ``self.corpus``
    """
    self.path = path
    reader_cls = Elsevier_Corpus_Reader.ScopusRawCorpusReader
    self.corpus = reader_cls(path)
def document_feature_counter(path, feature='pub_date', sort=False,
                             how='count', **kwargs) -> dict:
    """
    Count how often each value of a document meta-data feature occurs.

    Parameters
    ----------
    path: str
        path to the corpus
    feature: str
        name of the corpus-reader method whose output is counted
            'pub_date' - counts the values from ``corp.pub_date``
            'publication' - counts the values from ``corp.publication``
            'author_count' - counts the values from ``corp.author_count``
        ('pub_type' is currently commented out of the lookup table, so
        requesting it raises a KeyError)
    sort: bool
        when False the raw Counter is returned, when True the counts are
        returned as an OrderedDict
    how: str
        sort key, only consulted when ``sort`` is True
            'class' - order by the counted value itself (must be sortable)
            'count' - order by the number of occurrences, ascending
    kwargs:
        optional arguments forwarded unchanged to the selected corpus
        reader method.

    Returns
    -------
    dict like object, either a Counter object or an OrderedDict
    """
    corp = Elsevier_Corpus_Reader.ScopusRawCorpusReader(path)
    feature_map = {
        'pub_date': corp.pub_date,
        # 'pub_type': corp.pub_type,
        'publication': corp.publication,
        'author_count': corp.author_count,
    }
    # Position inside each (value, count) pair that drives the ordering.
    sort_how_map = {'class': 0, 'count': 1}

    # Calling with an empty ``kwargs`` is the same as calling with no
    # arguments, so a single call covers both cases.
    counts = Counter(feature_map[feature](**kwargs))

    if not sort:
        return counts

    def _pair_key(pair):
        return pair[sort_how_map[how]]

    ordered_pairs = sorted(counts.items(), key=_pair_key)
    return OrderedDict(ordered_pairs)
def plot_clusters(X, y, **kwargs) -> None:
    """Show a scatter plot of the first two columns of ``X`` coloured by ``y``.

    Parameters
    ----------
    X :
        indexed as ``X[:, 0]`` (x axis) and ``X[:, 1]`` (y axis)
    y :
        passed to seaborn as ``hue``
    kwargs :
        accepted but not used
    """
    # Create the 10x5 figure first; seaborn then draws on the current axes.
    plt.subplots(figsize=(10, 5))
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    from CorpusReader import Elsevier_Corpus_Reader
    from CorpusProcessingTools import Corpus_Vectorizer
    from CorpusProcessingTools import Corpus_Cluster

    # Read the processed corpus and take the first test subset produced by
    # the k-fold loader (100 passed positionally, no shuffling).
    corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
        corpus, 100, shuffle=False)
    subset = next(loader.fileids(test=True))

    # Materialise the tagged titles for the selected file ids.
    docs = list(corpus.title_tagged(fileids=subset))

    # # Plot hierarchical clustering
    # model = Pipeline([
    #     ("norm", Corpus_Vectorizer.TitleNormalizer()),
    #     ("vect", Corpus_Vectorizer.OneHotVectorizer()),
    #     ('clusters', Corpus_Cluster.HierarchicalClustering())
    # ])
    #
    # clusters = model.fit_transform(docs)
    # labels = model.named_steps['clusters'].labels
# NOTE(review): the next two statements look like the tail of a definition
# whose header lies outside this chunk; `corp` is not bound anywhere in the
# visible span before this point - confirm against the full file.
formatter = Elsivier_Corpus_Pre_Processor.PickledCorpusPreProcessor(corp)
formatter.transform()


def plot_features():
    """Build an AuthorNetworks object and plot its co-author network.

    Creates ``Author_Networks.AuthorNetworks`` for
    "Corpus/Processed_corpus/" and calls its
    ``co_author_network_bokeh_better`` method with the three categories
    'soft robot/2000', 'soft robot/2001' and 'soft robot/2002'.
    """
    AN = Author_Networks.AuthorNetworks("Corpus/Processed_corpus/")
    # AN.plot_co_author_network(categories='soft robot/2000')
    AN.co_author_network_bokeh_better(categories=['soft robot/2000',
                                                  'soft robot/2001',
                                                  'soft robot/2002'])


if __name__ == '__main__':
    # step 1: download the raw corpus from Elsevier
    # download_corpus()

    # step 2: reformat the corpus for faster manipulation
    # reformat_corpus()

    # step 3: process the corpus via process_corpus()
    process_corpus()

    # step 4: load the corpus reader
    corp = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")

    # step 5: plot author connectivity
    # plot_features()
def setUp(self) -> None:
    """Create the raw-corpus reader fixture and store it on ``self.corp``."""
    corpus_dir = "Corpus/Processed_corpus/"
    self.corp = Elsevier_Corpus_Reader.ScopusRawCorpusReader(corpus_dir)
def setUp(self) -> None:
    """Create the processed-corpus reader and a k-fold loader over it.

    Stores ``corp`` (a ScopusProcessedCorpusReader over
    "Corpus/Processed_corpus/") and ``loader`` (a CorpuKfoldLoader built
    with ``n_folds=12`` and ``shuffle=False``) on the instance.
    """
    reader = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    self.corp = reader
    self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(
        reader, n_folds=12, shuffle=False)