Example no. 1
0
 def get_weather(self):
     # Gets the location coordinates, sends a request to the OWM API,
     # and returns a dict containing the weather data.
     lat, lon = utils.get_location_coordinates(self.location)
     owmap_request = utils.make_owmap_request(lat, lon, self.part,
                                              self.units, self.api_key)
     weather = utils.send_request(owmap_request)
     utils.save_json_file(weather, self.res_file)
     return weather
Example no. 2
0
    def add_dialogue_file(self, jsonObject, fileName=None):
        """
        Adds a new dialogue file and creates a DialogueAnnotator for it.
        """
        if not fileName:
            fileName = self.__get_new_file_id()
            self.filesAdded += 1

        save_json_file(obj=jsonObject, path=os.path.join(self.path, fileName))
        self.allFiles[MultiAnnotator.__GOLD_FILE_NAME].update_dialogues(
            jsonObject)

        self.allFiles[fileName] = DialogueAnnotator(self.path, fileName)
        self.save()
Example no. 3
0
def get_companies_data():
    # Crawl the company index data and save it to company_index.json
    url = 'http://eurocham-cambodia.org/members-directory'
    indexes = crawl_companies_indexes(url)
    filename = 'company_index'
    save_json_file(indexes, filename)

    with Pool(10) as p:  # crawl profiles with 10 processes in parallel
        profiles = [
            profile for profile in p.map(crawl_companies_profiles, indexes)
            if profile is not None
        ]
    profile_filename = 'company_profile'
    save_json_file(profiles, profile_filename)
    print('Crawled European Chamber of Commerce in Cambodia successfully')
Example no. 4
0
def serialize_and_save(entries, filename, use_serializer=True):
    if use_serializer:
        json_data = serialize_items(entries)
    else:
        # just dump it to json without any fancy stuff
        json_data = entries

    return save_json_file(data=json_data, filename=filename)
Example no. 5
0
def process_data():
    """
    Prepare the datasets of concepts and modifiers, then generate and save
    the data in the proper format to train the NER model by matching the
    entities' search terms in the texts of the reviews.
    """

    # Prepare the dataset of concepts
    # Load dataset in pandas DataFrame
    concepts_df = pd.read_excel(CONCEPTS_FILE_PATH, header=0)
    # Fill null values of column "Concept" with the previous value
    filled_df = fill_null_rows_with_previous_value(concepts_df, ['Concept'])
    # Remove rows with a null value in the column "Name"
    cleaned_df = remove_rows_with_null(filled_df, ['Name'])
    # Group the raw values of the column "Name" in a list by the column "Concept"
    grouped_df = group_columns_by_row(cleaned_df, 'Concept', 'Name')
    # Convert the DataFrame to a dictionary
    concepts_and_terms = grouped_df.to_dict()

    # Prepare the dataset of modifiers 
    # Load dataset in pandas DataFrame
    modifiers_df = pd.read_excel(MODIFIERS_FILE_PATH, header=0)
    # Get sets of adjectives and adverbs
    adjectives = set(modifiers_df['ADJETIVOS'].to_list())
    adverbs = set(modifiers_df['ADVERBIOS'].dropna().to_list())
    # Get final list of modifiers 
    modifiers = get_modifiers(adjectives, adverbs)
    modifiers_and_terms = {"modifier": modifiers}

    # Get a dict with all entities and their search terms
    label_and_terms = dict(concepts_and_terms, **modifiers_and_terms)

    # Get the list of texts of the reviews
    reviews = get_json_from_file_path(CORPUS_FILE_PATH)
    print(f'Number of reviews in the dataset: {len(reviews)}')

    # Get the data in proper format to train the NER model
    print('Generating the data for the NER model by matching the '
          'search terms of the entities in the texts of the reviews...')
    data = get_data(reviews, label_and_terms)
    print('Data in the proper format have been generated')

    # Save the data
    save_json_file(PROCESSED_DATA_PATH, data)
    print(f'Processed data saved in {PROCESSED_DATA_PATH}')
Example no. 6
0
    def set_file(self, filePath, fileName=None):
        """
        Sets the file path and (optionally) the file name, and tries to load
        the file, creating it if it does not exist.
        """
        self.__filePath = filePath

        if fileName:
            self.__fileName = fileName
            try:
                self.__dialogues = load_json_file(
                    os.path.join(self.__filePath, self.__fileName))
            except FileNotFoundError:
                save_json_file(obj=self.__dialogues,
                               path=os.path.join(self.__filePath,
                                                 self.__fileName))

        else:
            self.__fileName = DialogueAnnotator.__DEFAULT_FILENAME
Example no. 7
0
def serialize_and_save(entries, filename, use_serializer=True):
    if use_serializer:
        json_data = serialize_items(entries)
    else:
        # just dump it to json without any fancy stuff
        json_data = entries

    return save_json_file(
        data=json_data,
        filename=filename)
Example no. 8
0
def post(city=None, taxi_name=None):
    if city and taxi_name:
        filedata = utils.load_json_file('taxis.json')
        taxi_info = {}
        for taxi in filedata["data"]:
            if taxi["name"] == taxi_name:
                print("Found taxi!")
                taxi["state"] = "hired" if taxi["state"] == "free" else "free"
                print(taxi)
                taxi_info = taxi
        utils.save_json_file(filedata, 'taxis.json')
        response = {
            "meta": {
                "count": 1,
                "links": {
                    "self": "https://mock-travel-apis.herokuapp.com/taxis/" +
                            taxi_info["city"] + "/" + taxi_info["name"]
                },
            },
            "data": taxi_info
        }
        return json.dumps(response)
    else:
        return "Could not book taxi"
Example no. 9
0
 def dump(self):
     save_json_file(self.file_path, self.data)
Example no. 10
0
 def save_spell(self, fn):
     utils.save_json_file(self.spell_dict, fn)
Example no. 11
0
def main():
    # -------------------------------
    #         PARSE ARGUMENTS
    # -------------------------------
    arg_names = ['command', 'dataset_name', 'snapshot_num']
    if len(sys.argv) != 3:
        print("Please check the arguments.\n")
        print("Example usage:")
        print("python ./.../preprocess_dataset.py Twitter16 3")
        exit()
    args = dict(zip(arg_names, sys.argv))
    dataset, snapshot_num = args['dataset_name'], int(args['snapshot_num'])
    print_dict(args)

    paths = {}
    if dataset in ['Twitter15', 'Twitter16']:
        # --------------------------
        #         INIT PATHS
        # --------------------------
        # Input
        paths['raw'] = './data/raw/rumor_detection_acl2017/'
        paths['raw_label'] = os.path.join(paths['raw'], dataset.lower(),
                                          'label.txt')
        paths['raw_tree'] = os.path.join(paths['raw'], dataset.lower(),
                                         'tree/')
        paths['resource_label'] = (
            './resources/{0}/{0}_label_all.txt'.format(dataset))
        paths['resource_tree'] = (
            './resources/{0}/data.TD_RvNN.vol_5000.txt'.format(dataset))
        # Output (timestamp, index)
        paths['timestamps_raw'] = (
            './data/timestamps/{}/timestamps_raw.txt'.format(dataset))
        paths['timestamps'] = (
            './data/timestamps/{}/timestamps.txt'.format(dataset))
        paths['sequential_snapshots'] = (
            './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
                dataset, snapshot_num))
        paths['temporal_snapshots'] = (
            './data/timestamps/{}/temporal_snapshots_{:02}.txt'.format(
                dataset, snapshot_num))
        print_dict(paths)

        # --------------------------------------
        #         RAW / RESOURCE DATASET
        # --------------------------------------
        raw = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        raw['id_label_dict'], _ = load_raw_labels(paths['raw_label'])
        resource['id_label_dict'], _ = load_resource_labels(
            paths['resource_label'])
        resource['trees_dict'] = load_resource_trees(paths['resource_tree'])

        temporal_info = raw_tree_to_timestamps(paths['raw_tree'],
                                               paths['timestamps'])
        save_json_file(paths['timestamps_raw'], temporal_info)
        # temporal_info = load_json_file(paths['timestamps_raw'])  # cache
        temporal_info = retrieve_temporal_info(temporal_info, resource)
        save_json_file(paths['timestamps'], temporal_info)
        edge_index = sequence_to_snapshot_index(temporal_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)
        edge_index = temporal_to_snapshot_index(temporal_info, snapshot_num)
        save_json_file(paths['temporal_snapshots'], edge_index)

    elif dataset in ['Weibo']:
        # --------------------------
        #         INIT PATHS
        # --------------------------
        paths['resource_label'] = './resources/{0}/weibo_id_label.txt'.format(
            dataset)
        paths['resource_tree'] = './resources/{0}/weibotree.txt'.format(
            dataset)
        paths['timestamps'] = './data/timestamps/{}/timestamps.txt'.format(
            dataset)
        paths['sequential_snapshots'] = (
            './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
                dataset, snapshot_num))

        # --------------------------------
        #         RESOURCE DATASET
        # --------------------------------
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource['id_label_dict'], _ = load_resource_labels_weibo(
            paths['resource_label'])
        resource['trees_dict'] = load_resource_trees_weibo(
            paths['resource_tree'])

        sequential_info = retrieve_sequential_info_weibo(resource)
        save_json_file(paths['timestamps'], sequential_info)
        edge_index = sequence_to_snapshot_index(sequential_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)

    elif dataset in ['Pheme']:  # TODO:
        # --------------------------
        #         INIT PATHS
        # --------------------------
        paths['resource_label'] = (
            './resources/{0}/pheme-label_balance.txt'.format(dataset))
        paths['resource_tree'] = './resources/{0}/pheme.vol_5000.txt'.format(
            dataset)
        paths['timestamps'] = './data/timestamps/{}/timestamps.txt'.format(
            dataset)
        paths['sequential_snapshots'] = (
            './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
                dataset, snapshot_num))

        # --------------------------------
        #         RESOURCE DATASET
        # --------------------------------
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource['id_label_dict'], _ = load_resource_labels_weibo(
            paths['resource_label'])
        resource['trees_dict'] = load_resource_trees_weibo(
            paths['resource_tree'])

        sequential_info = retrieve_sequential_info_weibo(resource)
        save_json_file(paths['timestamps'], sequential_info)
        edge_index = sequence_to_snapshot_index(sequential_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)

    else:
        print("Please check the dataset name.\n")
        print("E.g. Twitter15, Twitter16, Weibo")
        exit()
Example no. 12
0
    if offset != 0:
        document['text'] += ' '
        offset += 1
    document['text'] += sentence['text']
    correct_spans(sentence, offset)
    document['entities'] += sentence['entities']


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--conll_file', help='')
    parser.add_argument('--save_to', help='')
    parser.add_argument('--jsonl', action='store_true', help='')
    parser.add_argument('--document_ids', type=str, help='')
    args = parser.parse_args()

    data = {}
    with open(args.document_ids,
              encoding='utf-8') as document_ids_input_stream:
        for document_sentence, document_id in zip(
                iter_sentences(args.conll_file), document_ids_input_stream):
            if document_id not in data:
                data[document_id] = {
                    'document_id': document_id,
                    'text': '',
                    'entities': []
                }
            append_sentence_to_document(data[document_id], document_sentence)
    data = [document for document_id, document in data.items()]
    save_json_file(data, args.save_to, args.jsonl)
Example no. 13
0
def save_data(logreg_model, std_scaler, df_training_data, qid_to_class,
              class_to_qid, all_docs_kb, data_kb_with_vectors, args):
    """Saves the new model in models/KB_id_{KB_id} with the scaler, 
       the training data, the ref-tables and the data_kb_with_vectors
    """

    path = Path(ROOT)
    KB_id = args.KB_id

    path_live = path / "models" / f"KB_id_{KB_id}" / "live"
    path_live.mkdir(parents=True, exist_ok=True)

    path_archive = path / "models" / f"KB_id_{KB_id}" / "archive"
    path_archive.mkdir(parents=True, exist_ok=True)

    # check if path_live is empty
    files = os.listdir(path_live)

    if files:
        # move file to path_archive
        for f in files:
            path_archive_new = path_archive / datetime.now().strftime(
                '%d_%m_%Y_time_%H_%M_%S')
            path_archive_new.mkdir(parents=True, exist_ok=True)

            shutil.move(src=str((path_live / f)), dst=str(path_archive_new))
        # append the time being archived
        with open(path_archive_new / "logs", "a") as fp:
            fp.write(
                f"\nArchived: {datetime.now().strftime('%d_%m_%Y_time_%H_%M_%S')}"
            )

    # save training data
    df_training_data.to_csv(path_live / TRAINING_DATA_FILE, sep=";")

    # save all_docs_kb
    save_json_file(all_docs_kb, path_live / args.filepath_json)
    # save data_kb_with_vector
    save_pickle_dict(data_kb_with_vectors,
                     path_live / DATA_KB_WITH_VECTORS_FILE)

    # save reference dictionaries
    save_pickle_dict(qid_to_class, path_live / "qid_to_class.pkl")
    save_pickle_dict(class_to_qid, path_live / "class_to_qid.pkl")

    # save scaler
    save_pickle_dict(std_scaler, path_live / "std_scaler.pkl")
    dump(logreg_model, open(path_live / "logreg_model.joblib", "wb"))

    # save a logs file
    with open(path_live / "logs", "a") as fp:
        fp.write(
            f"Went live at: {datetime.now().strftime('%d_%m_%Y_time_%H_%M_%S')}"
        )

    # Save a config file describing how the model was created
    configfile_name = path_live / "config.cfg"
    # Check if there is already a configuration file
    if not os.path.isfile(configfile_name):
        # Create the configuration file as it doesn't exist yet
        cfgfile = open(configfile_name, 'w')

        # Add content to the file
        Config = configparser.ConfigParser()
        Config.set(configparser.DEFAULTSECT, 'without_stopwords',
                   str(args.without_stopwords))
        Config.set(configparser.DEFAULTSECT, 'num_of_sentences',
                   str(args.num_of_sentences))
        Config.set(configparser.DEFAULTSECT, 'all_docs_kb_filename',
                   str(args.filepath_json))
        Config.write(cfgfile)
        cfgfile.close()

    return
Example no. 14
0
 def save(self):
     """
     Save the dialogues dictionary
     """
     save_json_file(obj=self.__dialogues,
                    path=os.path.join(self.__filePath, self.__fileName))
Example no. 15
0
def serialize_and_save(entries, filename):
    json_data = serialize_items(entries)

    return save_json_file(
        data=json_data,
        filename=filename)
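
All of the snippets above call a project-specific save_json_file helper (and, in a few cases, a matching load_json_file), with the argument order varying from project to project. Below is a minimal, hypothetical sketch of what such a pair of helpers could look like; the (data, path) signature and the ensure_ascii/indent options are assumptions for illustration, not the implementation used by any of the examples.

import json
from typing import Any


def save_json_file(data: Any, path: str) -> None:
    # Hypothetical helper: serialize `data` to JSON and write it to `path`.
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)


def load_json_file(path: str) -> Any:
    # Hypothetical helper: read the JSON file at `path` and return its content.
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)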