Example #3
    def __init__(self):
        """
            Initialize state of IheLda with default preprocessor, publication data loader, scopus publications data, list of IHE-specific
            keywords, number of IHE research expertise topics, text vectorizer and model.
        """
        self.preprocessor = Preprocessor()
        self.loader = PublicationLoader()
        self.data = None  # publication dataframe with columns {DOI, Description}.
        self.keywords = None  # list of IHE-specific keywords.
        self.num_topics = 0
        self.vectorizer = self.get_vectorizer(1, 3, 1, 0.2)
        self.model = None
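
The get_vectorizer(1, 3, 1, 0.2) call above is not defined in these examples; judging by the arguments, it builds an n-gram vectorizer with an n-gram range of 1 to 3, a minimum document frequency of 1 and a maximum document frequency of 0.2. A minimal standalone sketch of such a helper, assuming it wraps scikit-learn's TfidfVectorizer (the actual wrapper is not shown here):

from sklearn.feature_extraction.text import TfidfVectorizer

def get_vectorizer(min_n, max_n, min_df, max_df):
    # Hypothetical reconstruction: word n-grams from min_n to max_n,
    # ignoring terms that appear in fewer than min_df documents or in
    # more than max_df (as a proportion) of all documents.
    return TfidfVectorizer(ngram_range=(min_n, max_n),
                           min_df=min_df,
                           max_df=max_df,
                           strip_accents='unicode')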
    def publication_string_matches_results(self) -> dict:
        """
            Loads string matching keyword counts for scopus publications and stores the results as a dictionary.
        """
        data = PublicationLoader().load_string_matches_results()
        # convert the MongoDB response into a plain, serializable dictionary.
        data = json.loads(json_util.dumps(data))
        results = {}  # dictionary mapping each DOI to an array of SDG keyword counts.

        for doi in data:
            sdg_dict = data[doi]['Related_SDG']
            counts = [0] * self.num_sdgs

            for sdg, word_found_dict in sdg_dict.items():
                # extract the SDG number from keys such as "SDG 14"; unnumbered
                # keys (e.g. "Misc") fall into the final bucket.
                sdg_match = re.search(r'\d(\d)?', sdg)
                sdg_num = int(sdg_match.group()) if sdg_match is not None else self.num_sdgs
                counts[sdg_num - 1] = len(word_found_dict['Word_Found'])

            results[doi] = counts

        return results
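
The returned dictionary maps each DOI to a fixed-length list of keyword counts, one slot per SDG bucket. A brief usage sketch; the owning class is not shown in this excerpt, so matcher below stands in for an instance of it:

# matcher is assumed to be an instance of the class defining the method above.
results = matcher.publication_string_matches_results()
for doi, counts in results.items():
    # counts[i] is the number of keywords matched for SDG i + 1; the final
    # slot collects the unnumbered "Misc" bucket.
    print(doi, counts)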
    def load_publications(self) -> None:
        """
            Load publications from pymongo database and serialize.
        """
        PublicationLoader().load_pymongo_db()
Example #6
class IheSvmDataset():
    """
        Creates a Scopus research publications dataset with IHE topic tags for training the SVM.
        The dataset is a dataframe with columns {ID, Description, IHE} where ID is the DOI.
    """
    def __init__(self):
        """
            Initializes the threshold for tagging a document with an IHE topic, the publication loader and the output CSV file.
        """
        self.threshold = 20  # documents are tagged with an IHE topic only if its weight (as a percentage) exceeds this value.
        self.publication_loader = PublicationLoader()
        self.publication_preprocessor = Preprocessor()
        self.svm_dataset = "main/NLP/SVM/SVM_dataset_ihe.csv"
        # dataframe with columns {DOI, Title, Description}.
        self.df_publications = self.publication_loader.load("MAX")

        with open("main/NLP/LDA/IHE_RESULTS/training_results.json") as json_file:
            self.data = json.load(json_file)
        self.num_ihes = len(self.data['Topic Words'])

    def __progress(self,
                   count: int,
                   total: int,
                   custom_text: str,
                   suffix: str = '') -> None:
        """
            Visualises progress for a process given a current count and a total count.
        """
        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '*' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s %s %s\r' %
                         (bar, percents, '%', custom_text, suffix))
        sys.stdout.flush()

    def get_publication_description(self, doi: str):
        """
            Returns the publication description for a particular DOI.
        """

        # search for row in dataframe by DOI.
        df = self.df_publications.loc[self.df_publications["DOI"] == doi]
        return None if len(df) == 0 else df["Description"].values[0]

    def tag_publications(self):
        """
            Returns a dataframe with columns {ID, Description, IHE} for each publication, where IHE is a class tag for training the SVM.
        """
        results = pd.DataFrame(columns=['ID', 'Description',
                                        'IHE'])  # ID = DOI

        num_publications = len(self.data['Document Topics'])
        counter = 0

        for doi in self.data['Document Topics']:
            self.__progress(counter, num_publications,
                            "Forming Publications IHE Dataset for SVM...")
            raw_weights = self.data['Document Topics'][doi]
            weights = [0] * self.num_ihes
            for i in range(self.num_ihes):
                # strip the surrounding formatting from the serialized weight
                # string and parse it; unparsable entries default to 0.0.
                weight = raw_weights[i][4:-2]
                try:
                    w = float(weight)
                except ValueError:
                    w = 0.0
                weights[i] = w

            weights = np.asarray(weights)
            # gets the IHE topic corresponding to the maximum weight.
            ihe_max = weights.argmax() + 1
            ihe_weight_max = weights[ihe_max - 1]  # gets the maximum weight.

            description = self.get_publication_description(doi)
            description = "" if description is None else self.publication_preprocessor.preprocess(description)

            if description != "":
                if ihe_weight_max >= self.threshold:
                    # Tag the publication with the IHE topic that has the maximum weight if it is greater than the threshold value.
                    row_df = pd.DataFrame([[doi, description, ihe_max]],
                                          columns=results.columns)
                else:
                    # Set the IHE tag of the publication to None if the maximum weight is less than the threshold value.
                    row_df = pd.DataFrame([[doi, description, None]],
                                          columns=results.columns)

                results = results.append(row_df,
                                         verify_integrity=True,
                                         ignore_index=True)

            counter += 1
        print()
        return results

    def run(self) -> None:
        """
            Tags the publications with their most related IHE topic, if related to one at all.
            Serializes the resulting dataframe as a CSV file.
        """
        # column format of the dataframe is {ID, Description, IHE} where ID is the DOI.
        df = pd.DataFrame()
        df = df.append(self.tag_publications(), ignore_index=True)

        df = df.reset_index()
        df.to_csv(self.svm_dataset)
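
A minimal usage sketch, assuming the publication loader and the LDA training results file referenced above are available:

# Build the IHE-tagged publication dataset and serialize it as a CSV file.
dataset = IheSvmDataset()
dataset.run()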
class ScopusPrediction():
    def __init__(self):
        self.publication_data = pd.DataFrame(
            columns=['DOI', 'Title', 'Description'])
        self.model_name = "main/NLP/LDA/SDG_RESULTS/model.pkl"
        self.loader = PublicationLoader()

    def __progress(self, count, total, custom_text, suffix=''):
        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '*' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s %s %s\r' %
                         (bar, percents, '%', custom_text, suffix))
        sys.stdout.flush()

    def __writeToDB_Scopus(self, data):
        # upsert the prediction record into the (module-level) pymongo collection.
        col.update_one({"DOI": data["DOI"]}, {"$set": data}, upsert=True)

    def make_predictions(self, limit):
        results = {}
        counter = 1
        papers = self.publication_data.head(
            limit) if limit else self.publication_data
        num_papers = len(papers)

        with open(self.model_name, 'rb') as f:
            lda = pickle.load(f)
            for i in range(num_papers):
                self.__progress(counter, num_papers, "Predicting...")
                description = papers['Description'][i]

                # vectorize the description and convert the sparse matrix into
                # a gensim corpus to query the trained LDA model.
                X_predicted = lda.vectorizer.transform([description])
                C_predicted = gensim.matutils.Sparse2Corpus(
                    X_predicted, documents_columns=False)
                topic_distribution = lda.model.get_document_topics(C_predicted)

                td = list(topic_distribution)[0]
                doi = papers['DOI'][i]
                results[doi] = {'Title': papers['Title'][i], 'DOI': doi}
                for topic, pr in td:
                    results[doi][str(topic + 1)] = str(pr)

                self.__writeToDB_Scopus(results[doi])
                counter += 1

        print()
        with open("main/NLP/LDA/SDG_RESULTS/scopus_prediction_results.json",
                  "w") as f:
            json.dump(results, f)
        client.close()  # close the (module-level) pymongo client.

    def load_publications(self):
        data = self.loader.load_all()
        for key in data:
            # convert the MongoDB document into a plain dictionary.
            publication = json.loads(json_util.dumps(data[key]))
            abstract = publication["Abstract"]
            doi = publication["DOI"]
            if abstract and doi:
                title = publication["Title"]
                author_keywords = publication['AuthorKeywords']
                index_keywords = publication['IndexKeywords']
                subject_areas = publication['SubjectAreas']

                # concatenate all searchable text fields into one description.
                concat_data_fields = title + " " + abstract
                if author_keywords:
                    concat_data_fields += " " + " ".join(author_keywords)
                if index_keywords:
                    concat_data_fields += " " + " ".join(index_keywords)
                if subject_areas:
                    subject_name = [x[0] for x in subject_areas]
                    concat_data_fields += " " + " ".join(subject_name)

                row_df = pd.DataFrame([[doi, title, concat_data_fields]],
                                      columns=self.publication_data.columns)
                self.publication_data = self.publication_data.append(
                    row_df, verify_integrity=True, ignore_index=True)

    def predict(self):
        self.load_publications()
        self.make_predictions(limit=None)
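
A minimal usage sketch, assuming the pickled LDA model and the module-level pymongo handles (col, client) used above exist:

# Load publications, predict topic distributions with the trained LDA model,
# push each record to MongoDB and back up the full results as JSON.
predictor = ScopusPrediction()
predictor.predict()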
class SdgSvmDataset():
    """
        Creates a dataset of UCL modules and Scopus research publications with SDG tags for training the SVM.
        The dataset is a dataframe with columns {ID, Description, SDG} where ID is either Module_ID or DOI.
    """
    def __init__(self):
        """
            Initializes the threshold for tagging a document with an SDG, module loader, publication loader and output pickle file.
        """
        self.threshold = 20  # documents are tagged with an SDG only if its weight (as a percentage) exceeds this value.
        self.module_loader = ModuleLoader()
        self.publication_loader = PublicationLoader()
        self.module_preprocessor = ModuleCataloguePreprocessor()
        self.publication_preprocessor = Preprocessor()
        self.svm_dataset = "main/NLP/SVM/SVM_dataset_sdg.pkl"
        self.num_sdgs = 18  # the 17 SDGs plus a miscellaneous category.
        self.df_modules = self.module_loader.load(
            "MAX")  # dataframe with columns {Module_ID, Module_Description}.
        self.df_publications = self.publication_loader.load(
            "MAX")  # dataframe with columns {DOI, Title, Description}.

    def __progress(self,
                   count: int,
                   total: int,
                   custom_text: str,
                   suffix: str = '') -> None:
        """
            Visualises progress for a process given a current count and a total count.
        """
        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '*' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s %s %s\r' %
                         (bar, percents, '%', custom_text, suffix))
        sys.stdout.flush()

    def get_module_description(self, module_id: str):
        """
            Returns the module description for a particular module_id.
        """

        # search for row in dataframe by Module_ID.
        df = self.df_modules.loc[self.df_modules["Module_ID"] == module_id]
        return None if len(df) == 0 else df["Module_Description"].values[0]

    def get_publication_description(self, doi: str):
        """
            Returns the publication description for a particular DOI.
        """

        # search for row in dataframe by DOI.
        df = self.df_publications.loc[self.df_publications["DOI"] == doi]
        return None if len(df) == 0 else df["Description"].values[0]

    def tag_modules(self):
        """
            Returns a dataframe with columns {ID, Description, SDG} for each module, where SDG is a class tag for training the SVM.
        """
        results = pd.DataFrame(columns=['ID', 'Description',
                                        'SDG'])  # ID = Module_ID
        # loads LDA prediction data from the ModulePrediction table in mongodb.
        data = self.module_loader.load_lda_prediction_results()

        doc_topics = data['Document Topics']
        num_modules = len(doc_topics)
        counter = 0

        for module_id in doc_topics:
            self.__progress(counter, num_modules,
                            "Forming Modules Dataset for SVM...")
            raw_weights = doc_topics[module_id]
            weights = []
            for i in range(len(raw_weights)):
                # each raw weight is serialized as e.g. "(3, 27.5%)"; strip the
                # formatting and split into the SDG number and its weight.
                raw_weights[i] = raw_weights[i].replace('(', '').replace(
                    ')', '').replace('%', '').replace(' ', '').split(',')
                sdg_num = int(raw_weights[i][0])
                try:
                    w = float(raw_weights[i][1])
                except ValueError:
                    w = 0.0
                weights.append((sdg_num, w))

            # get the tuple (sdg, weight) with the maximum weight.
            sdg_weight_max = max(weights, key=lambda x: x[1])

            description = self.get_module_description(module_id)
            description = "" if description is None else self.module_preprocessor.preprocess(description)

            if description != "":
                if sdg_weight_max[1] >= self.threshold:
                    # Tag the module with the SDG that has the maximum weight if it is greater than the threshold value.
                    row_df = pd.DataFrame(
                        [[module_id, description, sdg_weight_max[0]]],
                        columns=results.columns)
                else:
                    # Set the SDG tag of the module to None if the maximum weight is less than the threshold value.
                    row_df = pd.DataFrame([[module_id, description, None]],
                                          columns=results.columns)

                results = results.append(row_df,
                                         verify_integrity=True,
                                         ignore_index=True)

            counter += 1

        return results

    def tag_publications(self):
        """
            Returns a dataframe with columns {ID, Description, SDG} for each publication, where SDG is a class tag for training the SVM.
        """
        results = pd.DataFrame(columns=['ID', 'Description',
                                        'SDG'])  # ID = DOI
        # loads LDA prediction data from the PublicationPrediction table in mongodb.
        data = self.publication_loader.load_lda_prediction_results()

        num_publications = len(data)
        counter = 0

        for doi in data:
            self.__progress(counter, num_publications,
                            "Forming Publications Dataset for SVM...")
            raw_weights = data[doi]
            weights = [0] * self.num_sdgs
            for i in range(self.num_sdgs):
                sdg_num = str(i + 1)
                try:
                    # convert probabilities in the range [0,1] to percentages.
                    w = float(raw_weights[sdg_num]) * 100.0
                except (KeyError, ValueError):
                    w = 0.0
                weights[i] = w

            weights = np.asarray(weights)
            # gets the SDG corresponding to the maximum weight.
            sdg_max = weights.argmax() + 1
            sdg_weight_max = weights[sdg_max - 1]  # gets the maximum weight.

            description = self.get_publication_description(doi)
            description = "" if description is None else self.publication_preprocessor.preprocess(description)

            if description != "":
                if sdg_weight_max >= self.threshold:
                    # Tag the publication with the SDG that has the maximum weight if it is greater than the threshold value.
                    row_df = pd.DataFrame([[doi, description, sdg_max]],
                                          columns=results.columns)
                else:
                    # Set the SDG tag of the publication to None if the maximum weight is less than the threshold value.
                    row_df = pd.DataFrame([[doi, description, None]],
                                          columns=results.columns)

                results = results.append(row_df,
                                         verify_integrity=True,
                                         ignore_index=True)

            counter += 1

        return results

    def run(self, modules: bool, publications: bool) -> None:
        """
            Tags the modules and/or publications with their most related SDG, if related to one at all, and combines them into a single dataframe.
            Serializes the resulting dataframe as a pickle file.
        """
        # column format of the dataframe is {ID, Description, SDG} where ID is either Module_ID or DOI.
        df = pd.DataFrame()
        if modules:
            df = df.append(self.tag_modules(), ignore_index=True)
        if publications:
            df = df.append(self.tag_publications(), ignore_index=True)

        df = df.reset_index()
        df.to_pickle(self.svm_dataset)
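
A minimal usage sketch, assuming the LDA prediction results for modules and publications are already in MongoDB:

# Tag both modules and publications and pickle the combined SVM dataset.
dataset = SdgSvmDataset()
dataset.run(modules=True, publications=True)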
Example #11
class ScopusStringMatch():
    def __init__(self):
        self.loader = PublicationLoader()
        self.mongodb_pusher = MongoDbPusher()
        self.preprocessor = Preprocessor()

    def __progress(self, count, total, custom_text, suffix=''):
        """
            Visualises progress for a process given a current count and a total count
        """

        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '*' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s %s %s\r' %
                         (bar, percents, '%', custom_text, suffix))
        sys.stdout.flush()

    def __read_keywords(self, data: dict) -> None:
        """
            Given a set of publications in a dictionary, performs pre-processing for all string type data fields.
            Performs look-up on SDG keyword occurrences in a document.
            Results are pushed to MongoDB (backed up in the JSON file scopus_matches.json).
        """

        resulting_data = {}
        counter = 0
        keywords = self.preprocessor.preprocess_keywords(
            "main/SDG_KEYWORDS/SDG_Keywords.csv")
        num_publications = len(data)
        num_keywords = len(keywords)

        for doi, publication in data.items():
            # visualise the progress on the command line.
            self.__progress(counter, num_publications,
                            "processing scopus_matches.json")
            counter += 1
            description = self.preprocessor.tokenize(publication["Description"])
            # accumulator for SDG keywords found in a given document.
            sdg_occurrences = {}
            for n in range(num_keywords):
                sdg_num = n + 1
                # keyword rows map to "SDG 1".."SDG 17"; the final row holds
                # miscellaneous keywords under "Misc".
                sdg = "SDG " + str(sdg_num) if sdg_num < num_keywords else "Misc"
                sdg_occurrences[sdg] = {"Word_Found": []}
                for keyword in keywords[n]:
                    if keyword in description:
                        sdg_occurrences[sdg]["Word_Found"].append(keyword)
                if len(sdg_occurrences[sdg]["Word_Found"]) == 0:
                    sdg_occurrences.pop(sdg, None)  # clear out empty occurrences.

            resulting_data[doi] = {
                "DOI": doi,
                "Related_SDG": sdg_occurrences
            }
        print()
        # push the processed data to MongoDB.
        self.mongodb_pusher.matched_scopus(resulting_data)
        print()
        # Record the same data locally, acts as a backup.
        with open('main/NLP/STRING_MATCH/SDG_RESULTS/scopus_matches.json',
                  'w') as outfile:
            json.dump(resulting_data, outfile)

    def run(self):
        """
            Controller method for the class.
            Loads all publications and matches them against the SDG keywords.
        """

        data = self.loader.load_all()
        self.__read_keywords(data)
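
A minimal usage sketch, assuming the publication database and the SDG keyword CSV are in place:

# Match every publication's text against the SDG keyword lists, push the
# matches to MongoDB and back them up as scopus_matches.json.
matcher = ScopusStringMatch()
matcher.run()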