def get_weather(self):
    # Get the location coordinates and send a request to the OWM API.
    # Returns a dict containing the weather data.
    lat, lon = utils.get_location_coordinates(self.location)
    owmap_request = utils.make_owmap_request(lat, lon, self.part, self.units,
                                             self.api_key)
    weather = utils.send_request(owmap_request)
    utils.save_json_file(weather, self.res_file)
    return weather
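# A minimal sketch of the request that utils.make_owmap_request above presumably
# builds: an OpenWeatherMap One Call URL assembled from the coordinates, excluded
# parts, units and API key. The helper below is a hypothetical stand-in, not the
# project's confirmed implementation.
def make_owmap_request_sketch(lat, lon, part, units, api_key):
    # "exclude" drops the response sections listed in `part` (e.g. "minutely,hourly").
    return ('https://api.openweathermap.org/data/2.5/onecall'
            f'?lat={lat}&lon={lon}&exclude={part}&units={units}&appid={api_key}')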
def add_dialogue_file(self, jsonObject, fileName=None):
    """
    adds a new DialogueAnnotator
    """
    if not fileName:
        fileName = self.__get_new_file_id()
        self.filesAdded += 1

    save_json_file(obj=jsonObject, path=os.path.join(self.path, fileName))

    self.allFiles[MultiAnnotator.__GOLD_FILE_NAME].update_dialogues(jsonObject)
    self.allFiles[fileName] = DialogueAnnotator(self.path, fileName)
    self.save()
def get_companies_data():
    # Get the company index data and save it to company_index.json.
    url = 'http://eurocham-cambodia.org/members-directory'
    indexes = crawl_companies_indexes(url)
    filename = 'company_index'
    save_json_file(indexes, filename)

    # Crawl the company profiles with a pool of 10 worker processes.
    with Pool(10) as p:
        profiles = [
            profile for profile in p.map(crawl_companies_profiles, indexes)
            if profile is not None
        ]

    profile_filename = 'company_profile'
    save_json_file(profiles, profile_filename)
    print('Crawled European Chamber of Commerce in Cambodia successfully')
def serialize_and_save(entries, filename, use_serializer=True):
    if use_serializer:
        json_data = serialize_items(entries)
    else:
        # Just dump it to JSON without any fancy stuff.
        json_data = entries
    return save_json_file(data=json_data, filename=filename)
def process_data():
    """
    Prepare the datasets with the concepts and modifiers, then generate and
    save the data in the proper format to train the NER model by matching the
    search terms of the entities in the texts of the reviews.
    """
    # Prepare the dataset of concepts
    # Load dataset in a pandas DataFrame
    concepts_df = pd.read_excel(CONCEPTS_FILE_PATH, header=0)
    # Fill null values of the column "Concept" with the previous filled value
    filled_df = fill_null_rows_with_previous_value(concepts_df, ['Concept'])
    # Remove rows with a null value in the column "Name"
    cleaned_df = remove_rows_with_null(filled_df, ['Name'])
    # Group the raw values of the column "Name" in a list by the column "Concept"
    grouped_df = group_columns_by_row(cleaned_df, 'Concept', 'Name')
    # Convert the DataFrame to a dictionary
    concepts_and_terms = grouped_df.to_dict()

    # Prepare the dataset of modifiers
    # Load dataset in a pandas DataFrame
    modifiers_df = pd.read_excel(MODIFIERS_FILE_PATH, header=0)
    # Get sets of adjectives and adverbs
    adjectives = set(modifiers_df['ADJETIVOS'].to_list())
    adverbs = set(modifiers_df['ADVERBIOS'].dropna().to_list())
    # Get the final list of modifiers
    modifiers = get_modifiers(adjectives, adverbs)
    modifiers_and_terms = {"modifier": modifiers}

    # Get a dict with all entities and their search terms
    label_and_terms = dict(concepts_and_terms, **modifiers_and_terms)

    # Get the list of texts of the reviews
    reviews = get_json_from_file_path(CORPUS_FILE_PATH)
    print(f'Number of reviews in the dataset: {len(reviews)}')

    # Get the data in the proper format to train the NER model
    print('Generating the data for the NER model by matching the '
          'search terms of the entities in the texts of the reviews...')
    data = get_data(reviews, label_and_terms)
    print('Data in the proper format has been generated')

    # Save the data
    save_json_file(PROCESSED_DATA_PATH, data)
    print(f'Processed data saved in {PROCESSED_DATA_PATH}')
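# A hedged sketch of the shape of label_and_terms built in process_data above:
# each concept label maps to its list of search terms, and the extra "modifier"
# key holds the adjectives/adverbs. The labels and terms below are invented
# examples, not values taken from the real spreadsheets.
label_and_terms_example = {
    'Room': ['habitación', 'cama', 'baño'],
    'Service': ['recepción', 'personal'],
    'modifier': ['bueno', 'malo', 'limpio'],
}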
def set_file(self, filePath, fileName=None):
    """
    sets the file and tries to load it to use
    """
    self.__filePath = filePath

    if fileName:
        self.__fileName = fileName
        try:
            self.__dialogues = load_json_file(
                os.path.join(self.__filePath, self.__fileName))
        except FileNotFoundError:
            save_json_file(obj=self.__dialogues,
                           path=os.path.join(self.__filePath, self.__fileName))
    else:
        self.__fileName = DialogueAnnotator.__DEFAULT_FILENAME
def post(city=None, taxi_name=None):
    if city and taxi_name:
        filedata = utils.load_json_file('taxis.json')
        taxi_info = {}
        for taxi in filedata["data"]:
            if taxi["name"] == taxi_name:
                print("Found taxi!")
                # Toggle the taxi's state between "free" and "hired".
                taxi["state"] = "hired" if taxi["state"] == "free" else "free"
                print(taxi)
                taxi_info = taxi
        utils.save_json_file(filedata, 'taxis.json')
        response = {
            "meta": {
                "count": 1,
                "links": {
                    "self": "https://mock-travel-apis.herokuapp.com/taxis/"
                            + taxi_info["city"] + "/" + taxi_info["name"]
                },
            },
            "data": taxi_info
        }
        return json.dumps(response)
    else:
        return "Could not book taxi"
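# A hedged sketch of the taxis.json structure that post() above expects: a
# top-level "data" list of taxi records with at least "name", "city" and
# "state" keys. The concrete values are invented examples.
taxis_json_example = {
    "data": [
        {"name": "taxi-001", "city": "berlin", "state": "free"},
        {"name": "taxi-002", "city": "berlin", "state": "hired"},
    ]
}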
def dump(self):
    save_json_file(self.file_path, self.data)
def save_spell(self, fn):
    utils.save_json_file(self.spell_dict, fn)
def main():
    # -------------------------------
    # PARSE ARGUMENTS
    # -------------------------------
    arg_names = ['command', 'dataset_name', 'snapshot_num']
    if len(sys.argv) != 3:
        print("Please check the arguments.\n")
        print("Example usage:")
        print("python ./.../preprocess_dataset.py Twitter16 3")
        exit()
    args = dict(zip(arg_names, sys.argv))
    dataset, snapshot_num = args['dataset_name'], int(args['snapshot_num'])
    print_dict(args)

    paths = {}
    if dataset in ['Twitter15', 'Twitter16']:
        # --------------------------
        # INIT PATHS
        # --------------------------
        # Input
        paths['raw'] = './data/raw/rumor_detection_acl2017/'
        paths['raw_label'] = os.path.join(paths['raw'], dataset.lower(), 'label.txt')
        paths['raw_tree'] = os.path.join(paths['raw'], dataset.lower(), 'tree/')
        paths['resource_label'] = './resources/{0}/{0}_label_all.txt'.format(dataset)
        paths['resource_tree'] = './resources/{0}/data.TD_RvNN.vol_5000.txt'.format(dataset)
        # Output (timestamp, index)
        paths['timestamps_raw'] = './data/timestamps/{}/timestamps_raw.txt'.format(dataset)
        paths['timestamps'] = './data/timestamps/{}/timestamps.txt'.format(dataset)
        paths['sequential_snapshots'] = './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
            dataset, snapshot_num)
        paths['temporal_snapshots'] = './data/timestamps/{}/temporal_snapshots_{:02}.txt'.format(
            dataset, snapshot_num)
        print_dict(paths)

        # --------------------------------------
        # RAW / RESOURCE DATASET
        # --------------------------------------
        raw = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        raw['id_label_dict'], _ = load_raw_labels(paths['raw_label'])
        resource['id_label_dict'], _ = load_resource_labels(paths['resource_label'])
        resource['trees_dict'] = load_resource_trees(paths['resource_tree'])

        temporal_info = raw_tree_to_timestamps(paths['raw_tree'], paths['timestamps'])
        save_json_file(paths['timestamps_raw'], temporal_info)
        # temporal_info = load_json_file(paths['timestamps_raw'])  # cache

        temporal_info = retrieve_temporal_info(temporal_info, resource)
        save_json_file(paths['timestamps'], temporal_info)

        edge_index = sequence_to_snapshot_index(temporal_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)

        edge_index = temporal_to_snapshot_index(temporal_info, snapshot_num)
        save_json_file(paths['temporal_snapshots'], edge_index)

    elif dataset in ['Weibo']:
        # --------------------------
        # INIT PATHS
        # --------------------------
        paths['resource_label'] = './resources/{0}/weibo_id_label.txt'.format(dataset)
        paths['resource_tree'] = './resources/{0}/weibotree.txt'.format(dataset)
        paths['timestamps'] = './data/timestamps/{}/timestamps.txt'.format(dataset)
        paths['sequential_snapshots'] = './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
            dataset, snapshot_num)

        # --------------------------------
        # RESOURCE DATASET
        # --------------------------------
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource['id_label_dict'], _ = load_resource_labels_weibo(paths['resource_label'])
        resource['trees_dict'] = load_resource_trees_weibo(paths['resource_tree'])

        sequential_info = retrieve_sequential_info_weibo(resource)
        save_json_file(paths['timestamps'], sequential_info)

        edge_index = sequence_to_snapshot_index(sequential_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)

    elif dataset in ['Pheme']:  # TODO:
        # --------------------------
        # INIT PATHS
        # --------------------------
        paths['resource_label'] = './resources/{0}/pheme-label_balance.txt'.format(dataset)
        paths['resource_tree'] = './resources/{0}/pheme.vol_5000.txt'.format(dataset)
        paths['timestamps'] = './data/timestamps/{}/timestamps.txt'.format(dataset)
        paths['sequential_snapshots'] = './data/timestamps/{}/sequential_snapshots_{:02}.txt'.format(
            dataset, snapshot_num)

        # --------------------------------
        # RESOURCE DATASET
        # --------------------------------
        resource = {
            'id_label_dict': None,
            'label_id_dict': None,
            'trees_dict': None,
        }
        resource['id_label_dict'], _ = load_resource_labels_weibo(paths['resource_label'])
        resource['trees_dict'] = load_resource_trees_weibo(paths['resource_tree'])

        sequential_info = retrieve_sequential_info_weibo(resource)
        save_json_file(paths['timestamps'], sequential_info)

        edge_index = sequence_to_snapshot_index(sequential_info, snapshot_num)
        save_json_file(paths['sequential_snapshots'], edge_index)

    else:
        print("Please check the dataset name.\n")
        print("E.g. Twitter15, Twitter16, Weibo")
        exit()
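# print_dict() is used in main() above but not defined in this snippet; a
# minimal sketch, assuming it simply prints each key/value pair of the
# argument and path dictionaries.
def print_dict_sketch(d):
    for key, value in d.items():
        print(f'{key}: {value}')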
    if offset != 0:
        document['text'] += ' '
        offset += 1
    document['text'] += sentence['text']
    correct_spans(sentence, offset)
    document['entities'] += sentence['entities']


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--conll_file', help='')
    parser.add_argument('--save_to', help='')
    parser.add_argument('--jsonl', action='store_true', help='')
    parser.add_argument('--document_ids', type=str, help='')
    args = parser.parse_args()

    data = {}
    with open(args.document_ids, encoding='utf-8') as document_ids_input_stream:
        for document_sentence, document_id in zip(
                iter_sentences(args.conll_file), document_ids_input_stream):
            if document_id not in data:
                data[document_id] = {
                    'document_id': document_id,
                    'text': '',
                    'entities': []
                }
            append_sentence_to_document(data[document_id], document_sentence)

    data = [document for document_id, document in data.items()]
    save_json_file(data, args.save_to, args.jsonl)
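# correct_spans() is called above but not defined in this snippet. A minimal
# sketch of what it presumably does: shift each sentence-local entity span by
# the document offset before the entities are merged into the document. The
# 'start'/'end' field names are assumptions.
def correct_spans_sketch(sentence, offset):
    # Move entity spans from sentence-local to document-level positions.
    for entity in sentence['entities']:
        entity['start'] += offset
        entity['end'] += offset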
def save_data(logreg_model, std_scaler, df_training_data, qid_to_class,
              class_to_qid, all_docs_kb, data_kb_with_vectors, args):
    """Saves the new model in models/KB_id_{KB_id} with the scaler,
    the training data, the ref-tables and the data_kb_with_vectors
    """
    path = Path(ROOT)
    KB_id = args.KB_id
    path_live = path / "models" / f"KB_id_{KB_id}" / "live"
    path_live.mkdir(parents=True, exist_ok=True)
    path_archive = path / "models" / f"KB_id_{KB_id}" / "archive"
    path_archive.mkdir(parents=True, exist_ok=True)

    # Check if path_live is empty
    files = os.listdir(path_live)
    if files:
        # Move the files to path_archive
        for f in files:
            path_archive_new = path_archive / datetime.now().strftime(
                '%d_%m_%Y_time_%H_%M_%S')
            path_archive_new.mkdir(parents=True, exist_ok=True)
            shutil.move(src=str(path_live / f), dst=str(path_archive_new))
            # Append the time of archiving
            with open(path_archive_new / "logs", "a") as fp:
                fp.write(
                    f"\nArchived: {datetime.now().strftime('%d_%m_%Y_time_%H_%M_%S')}"
                )

    # Save training data
    df_training_data.to_csv(path_live / TRAINING_DATA_FILE, sep=";")
    # Save all_docs_kb
    save_json_file(all_docs_kb, path_live / args.filepath_json)
    # Save data_kb_with_vectors
    save_pickle_dict(data_kb_with_vectors, path_live / DATA_KB_WITH_VECTORS_FILE)
    # Save reference dictionaries
    save_pickle_dict(qid_to_class, path_live / "qid_to_class.pkl")
    save_pickle_dict(class_to_qid, path_live / "class_to_qid.pkl")
    # Save scaler
    save_pickle_dict(std_scaler, path_live / "std_scaler.pkl")
    dump(logreg_model, open(path_live / "logreg_model.joblib", "wb"))

    # Save a logs file
    with open(path_live / "logs", "a") as fp:
        fp.write(
            f"Went live at: {datetime.now().strftime('%d_%m_%Y_time_%H_%M_%S')}"
        )

    # Save a config file describing how the model was created
    configfile_name = path_live / "config.cfg"
    # Check if there is already a configuration file
    if not os.path.isfile(configfile_name):
        # Create the configuration file as it doesn't exist yet
        cfgfile = open(configfile_name, 'w')
        # Add content to the file
        Config = configparser.ConfigParser()
        Config.set(configparser.DEFAULTSECT, 'without_stopwords',
                   str(args.without_stopwords))
        Config.set(configparser.DEFAULTSECT, 'num_of_sentences',
                   str(args.num_of_sentences))
        Config.set(configparser.DEFAULTSECT, 'all_docs_kb_filename',
                   str(args.filepath_json))
        Config.write(cfgfile)
        cfgfile.close()

    return
def save(self):
    """
    Save the dialogues dictionary
    """
    save_json_file(obj=self.__dialogues,
                   path=os.path.join(self.__filePath, self.__fileName))
def serialize_and_save(entries, filename):
    json_data = serialize_items(entries)
    return save_json_file(data=json_data, filename=filename)
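# save_json_file (and its load counterpart) is used throughout the snippets
# above but never defined here. A minimal sketch, assuming it simply serialises
# a Python object to a JSON file; the real helpers may differ, and the calls
# above show both save_json_file(obj, path) and save_json_file(path, obj)
# argument orders, so this is only an illustration of the idea.
import json


def save_json_file_sketch(obj, path, indent=4):
    # Write the object as pretty-printed UTF-8 JSON.
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(obj, fp, ensure_ascii=False, indent=indent)


def load_json_file_sketch(path):
    # Read a JSON file back into a Python object.
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)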