def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Two corpora to operate on',
                        nargs='+',
                        default=['state_of_the_union', 'abstract'])
    parser.add_argument('-k', '--top_k',
                        help='Number of elements for output',
                        type=int,
                        default=100)
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # remove and use actual args
    chosen_corpora = ['sustainability', 'bundestag']  # args['corpora']
    algorithm = "rake"  # args['algorithm']
    top_k = 100  # args['top_k']
    yearwise = True

    evaluate_single(config, algorithm, chosen_corpora, top_k,
                    use_unassigned=True, yearwise=yearwise)

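# A minimal, standalone sketch (mirroring the parser above; the command-line
# values are illustrative only) of what vars(parser.parse_args()) yields.
# demo_parser is a local name for this sketch, not part of the project code.
import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('-a', '--algorithm', default="rake")
demo_parser.add_argument('-c', '--corpora', nargs='+',
                         default=['state_of_the_union', 'abstract'])
demo_parser.add_argument('-k', '--top_k', type=int, default=100)
print(vars(demo_parser.parse_args(['-a', 'tfidf', '-c', 'sustainability', '-k', '50'])))
# -> {'algorithm': 'tfidf', 'corpora': ['sustainability'], 'top_k': 50}
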
def finish_simulation(simulation_setting, repetitions, logDir):
    analyzers = [rep['analyzer'] for rep in repetitions]
    cnf.save_config(simulation_setting, logDir + 'settings.yaml')

    # write graph metrics to csv
    for ind, analyser in enumerate(analyzers):
        analyser.write(logDir + 'graph_' + str(ind) + '.csv')

    # build mean and std over all analyzers
    metrics_mean = []
    metrics_std = []
    metrics_mean.append(analyzers[0].results['Version'])
    metrics_std.append(analyzers[0].results['Version'])
    for metric in analyzers[0].metrics:
        # use '!=' here: 'is not' compares identity, not string equality
        if metric.getMetricName() != 'Version':
            # a row is an analyzer
            metric_combined = np.array([
                analyser.results[metric.getMetricName()]
                for analyser in analyzers
            ])
            metrics_mean.append(np.mean(metric_combined, axis=0))
            metrics_std.append(np.std(metric_combined, axis=0))

    for suffix, contents in zip(['mean', 'std'], [metrics_mean, metrics_std]):
        with open(logDir + 'metrics_' + suffix + '.csv', 'w', newline='') as csv_file:
            combinedCsv = csv.writer(csv_file)
            combinedCsv.writerow(
                [metric.getMetricName() for metric in analyzers[0].metrics])
            for i in range(len(analyzers[0].results['Version'])):
                row = []
                for row_contents in contents:
                    row.append(row_contents[i])
                combinedCsv.writerow(row)

    mean = {
        metric.getMetricName(): metrics_mean[i]
        for i, metric in enumerate(analyzers[0].metrics)
    }
    std = {
        metric.getMetricName(): metrics_std[i]
        for i, metric in enumerate(analyzers[0].metrics)
    }
    return {
        'mean': mean,
        'std': std,
    }

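# A minimal sketch (toy numbers, not project data) of the aggregation above:
# each row is one analyzer's series for a metric, and mean/std are taken
# column-wise (axis=0), i.e. per recorded 'Version' step.
import numpy as np

toy_metric_rows = np.array([[1.0, 2.0, 3.0],   # analyzer 0
                            [3.0, 2.0, 1.0]])  # analyzer 1
print(np.mean(toy_metric_rows, axis=0))  # [2. 2. 2.]
print(np.std(toy_metric_rows, axis=0))   # [1. 0. 1.]
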
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    parser.add_argument('-t', '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # remove and use actual args
    chosen_corpora = [
        # 'state_of_the_union',
        'bundestag',
        'abstract',
        'sustainability'
    ]  # args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))

def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # deletes unusable documents and replaces date with year int
    # cleaning_abstracts(config, overwrite=False)
    # cleaning_sustainability(config, overwrite=False)
    # cleaning_bundestag(config, overwrite=True)
    # # cleaning_authors(config, overwrite=True)
    cleaning_un(config, overwrite=False)

def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t', '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()
    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    print(f'Yearwise of {chosen_corpora}')

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]
    corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(path_meta.path))
        corpus.save_corpus_without_text(
            modify_path(path_meta.path, without_text=True))

def main():
    parser = argparse.ArgumentParser(
        description='Translates keywords in keyword files of given paths')
    parser.add_argument(
        '-p', '--paths',
        help='Paths of keyword files to translate',
        nargs='+',
        default=['data/bundestag_corpus_rake_keywords_yearwise.json'])
    args = vars(parser.parse_args())
    # -p data/bundestag_corpus_rake_keywords.json

    config = ConfigLoader.get_config()
    paths = args['paths']
    cache_file = config["translator"]["cache_file"]
    google_client_secrets_file = config["translator"][
        "google_client_secret_file"]

    timeout = None
    if google_client_secrets_file and google_client_secrets_file != "":
        appflow = flow.InstalledAppFlow.from_client_secrets_file(
            google_client_secrets_file,
            scopes=['https://www.googleapis.com/auth/cloud-platform'])
        appflow.run_local_server()  # launch browser
        # appflow.run_console()
        translator = g_translate.Client(credentials=appflow.credentials)
    else:
        # fallback
        translator = googletrans.Translator()

    try:
        cache = load_cache_from_file(cache_file)
    except Exception as e:
        logging.warning("Loading of cache file failed")
        logging.warning(e)
        cache = {"de2en": {}, "en2de": {}}

    def iterate_keywords(data):
        tqdm_bar = tqdm(data.items(), total=len(data.keys()))
        for doc_id, keywords in tqdm_bar:
            if keywords:
                for keyword in keywords:
                    en_translation = keyword["english_translation"]
                    ger_translation = keyword["german_translation"]
                    if en_translation is None:
                        translated = translate(ger_translation, cache,
                                               translator, timeout, dest="en")
                        keyword["english_translation"] = translated
                    if ger_translation is None:
                        translated = translate(en_translation, cache,
                                               translator, timeout, dest="de")
                        keyword["german_translation"] = translated

    try:
        for path in paths:
            logging.debug(f'loading keywords at "{path}"')
            with open(path, encoding='utf-8') as f:
                data = json.load(f)

            logging.debug('translating keywords ...')
            iterate_keywords(data)

            logging.debug(f'saving keywords with translations at "{path}"')
            with open(path, "w", encoding='utf-8') as f:
                json.dump(data, f, indent=1, ensure_ascii=True)
    except KeyboardInterrupt:
        logging.debug('process was interrupted')
    finally:
        logging.debug('saving cache ...')
        save_cache_to_file(cache, cache_file)

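# A minimal sketch (hypothetical helpers; the project's real load_cache_from_file
# and save_cache_to_file live elsewhere) of the cache layout the fallback above
# assumes: a JSON file with one dict per translation direction ("de2en", "en2de").
import json


def load_cache_from_file_sketch(cache_file):
    # hypothetical: read the translation cache as plain JSON
    with open(cache_file, encoding='utf-8') as f:
        return json.load(f)


def save_cache_to_file_sketch(cache, cache_file):
    # hypothetical: persist the translation cache as plain JSON
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(cache, f, ensure_ascii=False, indent=1)
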
import json
import logging
import time
from typing import List, Union

from flask import Flask, render_template, request, Response

from utils import ConfigLoader, Corpus, Keyword, KeywordType, Language, KeywordTranslator, CorpusFilter
from simple_statistics import yearwise_documents

logging.basicConfig(level=logging.INFO)


def modify_path(path: str, algorithm: str):
    return path.replace('.json', f'_{algorithm}.json')


logging.info('importing corpora ...')
config = ConfigLoader.get_config()

corpus_data = {}
keyword_data = {}
min_year = 5000
max_year = 0

logging.info('importing corpora and keywords data ...')
start_time = time.time()
for corpus_name in config["corpora_for_viz"]:
    logging.info(corpus_name)
    with open(config["corpora_for_viz"][corpus_name]["corpus"]) as corpus_file:
        corpus_data[corpus_name] = json.load(corpus_file)
    with open(config["corpora_for_viz"][corpus_name]
              ["keywords"]) as keyword_file:
        keyword_data[corpus_name] = json.load(keyword_file)

def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # corpus = Corpus(source=config["corpora"]["abstract_corpus"],
    #                 language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"],
    #                 language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")

    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    # # exit(0)

    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents
    pseudo_corpus = corpus.year_wise_pseudo_documents()

    # extract keywords
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])

    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])

    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(
    #     KeyPhraseExtractor.rake(documents=pseudo_corpus))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year -> list of keywords}

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    counter = 0
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []
    for k in rake_keywords[rake_keywords_keys[0]]:
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))

def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf_skl',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag'])
    parser.add_argument('-t', '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # remove and use actual args
    algorithm = args['algorithm']
    translate_keywords = False  # args['translate']
    chosen_corpora = args['corpora']
    assign_keywords = False
    yearwise = True

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    if yearwise:
        KeyPhraseExtractor.top_k = 1000
        KeyPhraseExtractor.max_ngram = 3

    use = {
        "rake": KeyPhraseExtractor.rake,
        "tfidf_skl": KeyPhraseExtractor.tfidf_skl,
        "tfidf_pke": KeyPhraseExtractor.tfidf_pke,
        "text_rank": KeyPhraseExtractor.text_rank,
        "text_rank_pke": KeyPhraseExtractor.text_rank_pke,
        "yake": KeyPhraseExtractor.yake_pke,
        "single_rank": KeyPhraseExtractor.single_rank_pke,
        "topic_rank": KeyPhraseExtractor.topic_rank_pke,
        "topical_page_rank": KeyPhraseExtractor.topical_page_rank_pke,
        "position_rank": KeyPhraseExtractor.position_rank_pke,
        "multipartite_rank": KeyPhraseExtractor.multipartite_rank_pke
    }
    keyword_extractor = use[algorithm]

    print(f'Applied {algorithm} on {chosen_corpora} with translation={translate_keywords}')

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]
    if yearwise:
        corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        if translate_keywords:
            kwt = KeywordTranslator(
                cache_file=config["translator"]["cache_file"])
            corpus.translate_keywords(kwt)

        keyword_extractor(corpus=corpus)

        if assign_keywords:
            new_path = str(path_meta.path).replace('.json', f"_{algorithm}.json")
            corpus.save_corpus(new_path)
        else:
            new_path = str(path_meta.path).replace(
                '.json', f"_{algorithm}_keywords.json")
            if yearwise:
                new_path = str(new_path).replace('.json', "_yearwise.json")
            keyword_storage = {
                doc_id: document.keywords
                for doc_id, document in corpus.documents.items()
            }
            with open(new_path, 'w', encoding='utf-8') as f:
                json.dump(keyword_storage, f,
                          ensure_ascii=False,
                          indent=1,
                          default=lambda o: o.__dict__)
        print(f'wrote file {new_path}')

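# A minimal sketch (with a hypothetical ToyKeyword class, not the project's
# Keyword type) of how json.dump's default=lambda o: o.__dict__ serializes
# keyword objects in the block above: any non-JSON-serializable object is
# replaced by its attribute dict.
import json


class ToyKeyword:
    def __init__(self, english_translation, german_translation):
        self.english_translation = english_translation
        self.german_translation = german_translation


toy_storage = {"doc_1": [ToyKeyword("climate change", "Klimawandel")]}
print(json.dumps(toy_storage, ensure_ascii=False, indent=1,
                 default=lambda o: o.__dict__))
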
def main():
    pool = mp.Pool()
    log.info("Loading Config.")
    settings = cnf.load_config()

    results = {}
    for simulation_setting in settings:
        simulation_dir = './experiment/' + simulation_setting['sim_name'] + '/'
        try:
            os.makedirs(simulation_dir)
        except FileExistsError:
            pass
        cnf.save_config(simulation_setting, simulation_dir + 'settings.yaml')

        stepConfigs = cnf.get_iteration_steps(simulation_setting)
        results[simulation_setting['sim_name']] = {
            'dir': simulation_dir,
            'steps': [],
        }
        for ind, stepConfig in enumerate(stepConfigs):
            stepDir = simulation_dir + str(ind) + '/'
            try:
                os.mkdir(stepDir)
            except FileExistsError:
                pass
            stepResult = pool.apply_async(run_simulation,
                                          args=(stepConfig.copy(), stepDir))
            results[simulation_setting['sim_name']]['steps'].append({
                'settings': stepConfig,
                'result': stepResult,
                'stepDir': stepDir,
            })
    pool.close()

    # monitor progress
    ready = False
    while not ready:
        total = sum([
            step['settings']['sim_repetitions']
            for sim in results.values() for step in sim['steps']
        ])
        finished = sum([
            step['settings']['sim_repetitions']
            for sim in results.values() for step in sim['steps']
            if step['result'].ready()
        ])
        print(str(finished) + ' of ' + str(total) + ' jobs finished')
        ready = (total <= finished)
        try:
            time.sleep(1)
        except:
            pass

    if sum([
            not step['result'].successful()
            for sim in results.values() for step in sim['steps']
    ]) > 0:
        log.error('an exception occurred in a simulation')

    pool.join()

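# A minimal, self-contained sketch (toy work function, hypothetical names) of
# the apply_async / ready() polling pattern used above: submit jobs, close the
# pool, poll until every AsyncResult reports ready, then join and collect.
import multiprocessing as mp
import time


def toy_job(x):
    time.sleep(0.1)
    return x * x


if __name__ == '__main__':
    demo_pool = mp.Pool()
    async_results = [demo_pool.apply_async(toy_job, args=(i,)) for i in range(4)]
    demo_pool.close()
    while not all(r.ready() for r in async_results):
        time.sleep(0.1)
    demo_pool.join()
    print([r.get() for r in async_results])  # [0, 1, 4, 9]
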
def test_filter():
    config = ConfigLoader.get_config(relative_path="..")
    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN,
               name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE,
               name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN,
               name="sustainability_corpus")
    ]

    for corpus in corpora:
        corpus = corpus.get_n_documents_as_corpus(200)

        # test text_contains
        test_text_words = ['climate', 'klima']
        test = CorpusFilter.filter(corpus, text_contains_one_of=test_text_words)
        for t in test:
            is_incorporated = False
            for ttw in test_text_words:
                if ttw in t.text:
                    is_incorporated = True
            if not is_incorporated:
                assert False

        # test date_in_range
        test_date_range = range(2015, 2016)
        test = CorpusFilter.filter(corpus, date_in_range=test_date_range)
        for t in test:
            if t.date not in test_date_range:
                assert False

        # test is_one_of_languages
        test_languages = ['english', 'en']
        test = CorpusFilter.filter(corpus, is_one_of_languages=test_languages)
        for t in test:
            if t.language.lower() not in test_languages:
                assert False

        # test is_one_of_doc_ids
        test_doc_id = '0'
        test = CorpusFilter.filter(corpus, is_one_of_doc_ids=[test_doc_id])
        for t in test:
            if test_doc_id != t.doc_id:
                assert False

        # test has_authors
        test_author = 'test'
        test = CorpusFilter.filter(corpus, has_authors=[test_author])
        for t in test:
            if test_author not in t.author:
                assert False

        # test has_tags
        test_tags = 'test'
        test = CorpusFilter.filter(corpus, has_tags=[test_tags])
        for t in test:
            if test_tags not in t.tags:
                assert False

        # test is_one_of_parties
        test_parties = ["cdu", "FdP"]
        test = CorpusFilter.filter(corpus, is_one_of_parties=test_parties)
        for t in test:
            if t.party.lower() not in [x.lower() for x in test_parties]:
                assert False

        # test ratings_in_range
        test_rating_range = range(0, 7)
        test = CorpusFilter.filter(corpus, ratings_in_range=test_rating_range)
        for t in test:
            if t.rating not in test_rating_range:
                assert False

    # TODO: Test for keywords
    assert True

def main():
    config = ConfigLoader.get_config()
    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN, name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE, name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN, name="sustainability_corpus"),
        Corpus(source=config["corpora"]["state_of_the_union_corpus"],
               language=Language.EN, name="state_of_the_union_corpus")
    ]

    # count_non_years(corpora[0])
    # count_non_years(corpora[1])
    # count_non_years(corpora[2])
    # Results: non date vs useable date
    # abstract: 54135 / 261215, 1387 don't have a year as date but a string
    # bundestag: 0 / 877973
    # sustainability 3 / 221034

    print(document_number(corpora[0]))
    print(document_number(corpora[1]))
    print(document_number(corpora[2]))
    print(document_number(corpora[3]))

    print(token_number(corpora[0]))
    print(token_number(corpora[1]))
    print(token_number(corpora[2]))
    print(token_number(corpora[3]))
    # Results: token number
    # abstract: 59314582
    # bundestag: 226300348
    # sustainability: 52878146

    yearwise_documents(corpora[0], aggregation_func=len)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    # 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    # 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2, 1, 4, 14, 28, 47, 44, 124, 714, 962, 1080, 1143, 1513, 2104, 2341,
    # 2554, 2862, 2947, 3470, 3617, 4230, 4495, 4827, 5655, 6948, 8331, 10287, 11750, 14345, 16149, 19308, 20899,
    # 23429, 26201, 28937, 29835]

    yearwise_documents(corpora[0], aggregation_func=token_number)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    # 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    # 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [237, 289, 26, 196, 299, 4, 2, 302, 13, 35, 163, 2, 513, 13, 3, 354, 2763, 5930, 10297, 9573, 20802, 124895,
    # 172925, 202836, 227647, 303919, 435539, 496060, 558721, 628000, 653111, 770258, 822043, 937258, 1009178, 1078762,
    # 1283970, 1593002, 1880724, 2268271, 2621783, 3192629, 3664511, 4406424, 4775594, 5367972, 6024271,
    # 6682090, 7080373]

    yearwise_documents(corpora[1], aggregation_func=len)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    # 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    # 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    # 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [1540, 7359, 7846, 7492, 6252, 5534, 5794, 7532, 6738, 4469, 4446, 7027, 5950, 7756, 8704, 12078, 13355, 14542,
    # 15855, 15673, 14876, 15917, 16901, 8760, 15082, 16343, 17110, 11914, 14095, 15597, 14811, 8937, 14207, 14647,
    # 9904, 16009, 19397, 16843, 10560, 16032, 16220, 11704, 14972, 14102, 17113, 11485, 16825, 17482, 13614, 9905,
    # 15310, 14208, 14124, 10926, 12884, 14305, 7757, 14210, 13508, 14408, 10609, 16643, 17751, 16497, 11335, 15374,
    # 14794, 13705, 5829, 17021, 9469]

    yearwise_documents(corpora[1], aggregation_func=token_number)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    # 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    # 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    # 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [612509, 2854944, 3061777, 3034065, 2113852, 2406060, 2380625, 2660021, 2460495, 2114953, 1715064, 2049805,
    # 1614656, 1634229, 1867580, 2135204, 2055406, 2452521, 2553521, 2575640, 2464189, 2675640, 2836025, 1644761,
    # 2665313, 3244912, 3004963, 2657335, 2751084, 2919374, 3366152, 2159773, 2722208, 3171091, 2280604, 3443955,
    # 3855233, 3566063, 2569335, 3565324, 4173720, 3067311, 3987509, 3832524, 4291976, 3145478, 4291797, 4338335,
    # 3925125, 3094547, 4464993, 4373147, 4392056, 3738766, 3946187, 4129635, 2350304, 4330315, 3983980, 4532271,
    # 3752798, 5167090, 5442241, 5468729, 3942007, 4846052, 4613129, 4046021, 1607377, 4583019, 2525648]

    yearwise_documents(corpora[2], aggregation_func=len)
    # [1986, 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
    # 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 39, 297, 476, 572, 749, 1017, 1117, 1327, 1479, 1673, 1953, 2072, 2246, 2762, 2971, 3593, 4149, 5313, 6234,
    # 7880, 9095, 10858, 12484, 15035, 17163, 20084, 23485, 29233, 35676]

    yearwise_documents(corpora[2], aggregation_func=token_number)

def main():
    # parse the command line arguments
    parser = argparse.ArgumentParser(prog=sys.argv[0], add_help=True)
    parser.add_argument('-g', '--game', default='ody')
    parser.add_argument('-b', '--batch_id', default=0)
    parser.add_argument('-e', '--env', default='local')
    parser.add_argument('-a', '--async_sort_process', default=1)  # MMING
    parser.add_argument('-p', '--async_push', default=0)
    parser.add_argument('-s', '--sort_data', default=0)
    parser.add_argument('-f', '--process_file', default=1)
    parser.add_argument('-t', '--process_time', default=0)
    parser.add_argument('-c', '--cleanup', default=1)
    parser.add_argument('-j', '--job_id', default=-1)
    parser.add_argument('-d', '--start_ts', default=0)

    # retrieve the arguments
    args = vars(parser.parse_args(sys.argv[1:]))
    game = args['game']
    batch_id = args['batch_id']
    env = args['env']
    async_sort_process = int(args['async_sort_process'])
    async_push = int(args['async_push'])
    sort_data = int(args['sort_data'])
    process_file = int(args['process_file'])
    process_time = int(args['process_time'])
    cleanup = int(args['cleanup'])
    job_id = int(args['job_id'])
    start_ts = int(args['start_ts'])

    # start the timer for the process
    timer = MZTimer(process_time)

    # get the config
    config = ConfigLoader(game, env, 'daily_snapshot').config

    message = "Dumped eco data from game: {}\n\n".format(
        time.strftime('%H:%M:%S', time.gmtime(process_time)))

    current_ts = TimeUtil.get_current_timestamp()
    if start_ts == 0:
        start_ts = current_ts
    current_time = TimeUtil.ts2str(current_ts)

    user = config['target']['user']
    processing_dir = config['source']['processing_dir']
    processed_dir = config['source']['processed_dir']
    working_dir = config['source']['working_dir']
    not_sent_dir = config['source']['not_sent_dir']
    archive_temp_dir = config['target']['archive_tmp_dir']
    target_archive_dir = config['target']['archive_dir']
    clusters = config['target']['clusters'].split(',')
    job_ids = config['target']['job_ids'].split(',')
    default_cluster = clusters[0]

    target_temp_dir = '{}/temp_{}'.format(archive_temp_dir, job_id)
    daily_snapshot_temp_dir = construct_daily_snapshot_temp_dir_path(
        archive_temp_dir, start_ts)
    daily_snapshot_control_dir = construct_daily_snapshot_control_dir_path(
        archive_temp_dir, start_ts)

    pool = multiprocessing.Pool()

    # sanity check
    if job_id < 0:
        clean_up_source_files(working_dir)
        subject = "Invalid job_id [{} UTC]".format(current_time)
        EmailUtil.send_email(config['email']['alert'], subject, message)
        sys.exit(0)

    # sort and compress the files
    if process_file == 1:
        print 'Sorting and compressing the files...'
        prefixes = config['source']['prefixes'].split(',')
        res = True
        if async_sort_process == 1:
            res = pool.map(
                partial(sort_and_compress,
                        game=game,
                        batch_id=batch_id,
                        job_id=job_id,
                        start_ts=start_ts,
                        sort_data=sort_data,
                        processing_dir=processing_dir,
                        working_dir=working_dir),
                prefixes)
            res = check_results(res)
        else:
            for prefix in prefixes:
                res = sort_and_compress(prefix, game, batch_id, job_id,
                                        start_ts, sort_data, processing_dir,
                                        working_dir)
        if not res:
            clean_up_source_files(working_dir)
            subject = "Error in sorting and compressing [{} UTC]".format(
                current_time)
            EmailUtil.send_email(config['email']['alert'], subject, message)
            sys.exit(0)
        timer.stop()
        message += "Sorted and Compressed files: {}\n\n".format(
            timer.sub_process_time_str)

    # send compressed files to archive server's temp
    print 'Sending processed files to archive server...'
    timer.sub_start()
    files = glob(os.path.join(working_dir, '*.gz'))
    hosts = config['target']['hosts'].split(',')
    results = {}
    for host in hosts:
        # create target temp dir if it does not exist on the archive server
        subprocess.call([
            'ssh', '{}@{}'.format(user, host), 'mkdir', '-p', target_temp_dir
        ])
        if async_push == 1:
            results[host] = pool.map(
                partial(send_files,
                        temp_dir=target_temp_dir,
                        host=host,
                        user=user),
                files)
        else:
            results[host] = []
            for log_file in files:
                results[host].append(
                    send_files(log_file, target_temp_dir, host, user))
    timer.stop()
    message += "Pushed files to archive servers: {}\n\n".format(
        timer.sub_process_time_str)

    # move the files to aggregated (if all exit status are 0) or not_sent (otherwise)
    timer.sub_start()
    failed = False
    for (n, log_file) in enumerate(files):
        exit_status = max([results[host][n] for host in results])
        if exit_status == 0:
            # successfully sent
            date = TimeUtil.get_date(current_ts)
            dest_dir = os.path.join(processed_dir, date)
            OSUtil.mkdir(dest_dir)
            shutil.move(log_file, dest_dir)
        else:
            # send failed; move working to not_sent directory
            failed = True
            failed_hosts = [host for host in results if results[host][n] != 0]
            for n, host in enumerate(failed_hosts):
                host_not_sent_dir = os.path.join(not_sent_dir, host)
                OSUtil.mkdir(host_not_sent_dir)
                if n == len(failed_hosts) - 1:
                    # move it
                    shutil.move(log_file, host_not_sent_dir)
                else:
                    # copy it
                    shutil.copy(log_file, host_not_sent_dir)

    if cleanup == 1:
        clean_up_source_files(processing_dir)

    if failed:
        subject = "[{}-ds] Error sending files to archive server. [{} UTC]".format(
            game, TimeUtil.get_current_time())
        EmailUtil.send_email(config['email']['alert'], subject, message)
        sys.exit(0)

    # move all the files to the remote archive dir
    print "Moving files to final temp directory on archive servers..."
    timer.sub_start()
    for host in hosts:
        user_host = '{}@{}'.format(user, host)
        # create temp and control dirs if they do not exist
        subprocess.call(
            ['ssh', user_host, 'mkdir', '-p', daily_snapshot_temp_dir])
        subprocess.call(
            ['ssh', user_host, 'mkdir', '-p', daily_snapshot_control_dir])
        src = os.path.join(target_temp_dir, '*')
        dest = daily_snapshot_temp_dir + '/'
        print 'ssh', user_host, 'mv', src, dest
        subprocess.call(['ssh', user_host, 'mv', src, dest])

        # mark single job success
        success_log_file_path = '{}/{}'.format(
            daily_snapshot_control_dir, construct_success_log_file_name(job_id))
        print(success_log_file_path)
        subprocess.call([
            'ssh', user_host,
            'echo ' + str(TimeUtil.get_current_timestamp()) + ' > ' +
            success_log_file_path
        ])
    timer.stop()
    message += "Moved files to final temp dir: {}\n\n".format(
        timer.sub_process_time_str)

    # move the log files from the final temp to final destinations
    last_job = False
    for host in hosts:
        if are_all_jobs_completed(host, user, daily_snapshot_control_dir,
                                  job_ids):
            last_job = True
            timer.sub_start()
            user_host = '{}@{}'.format(user, host)  # address the current host

            # move files from the final temp to default cluster
            src = os.path.join(daily_snapshot_temp_dir, '*')
            default_cluster_temp_dir = construct_cluster_temp_dir(
                archive_temp_dir, default_cluster)
            subprocess.call(
                ['ssh', user_host, 'mkdir', '-p', default_cluster_temp_dir])
            print 'ssh', user_host, 'mv', src, default_cluster_temp_dir
            subprocess.call(
                ['ssh', user_host, 'mv', src, default_cluster_temp_dir])

            # copy files from the default cluster temp to other cluster temps
            for cluster in clusters:
                if cluster != default_cluster:
                    cluster_temp_dir = construct_cluster_temp_dir(
                        archive_temp_dir, cluster)
                    subprocess.call(
                        ['ssh', user_host, 'mkdir', '-p', cluster_temp_dir])
                    # copy files from first temp directory to others
                    src = os.path.join(default_cluster_temp_dir, '*')
                    print 'ssh', user_host, 'cp', src, cluster_temp_dir
                    subprocess.call(
                        ['ssh', user_host, 'cp', src, cluster_temp_dir])

            # move files from each cluster temp to the cluster final destination
            for cluster in clusters:
                cluster_target_temp_dir = construct_cluster_temp_dir(
                    archive_temp_dir, cluster)
                src = os.path.join(cluster_target_temp_dir, '*')
                cluster_target_archive_dir = target_archive_dir.format(
                    cluster=cluster)
                dest = cluster_target_archive_dir + '/'
                print 'ssh', user_host, 'mv', src, dest
                subprocess.call(['ssh', user_host, 'mv', src, dest])

            # clean up the success log
            subprocess.call([
                'ssh', user_host,
                'rm -rf {}/*'.format(daily_snapshot_control_dir)
            ])
            timer.stop()
            message += "Moved files to final destinations on {}: {}\n\n".format(
                host, timer.sub_process_time_str)

    message += "The whole process ran in {}.\n\n".format(timer.process_time_str)

    # send email out
    subject = "[{}] Successfully Sending Daily Snapshot Data. Job ID: {} [{} UTC]".format(
        game, job_id, TimeUtil.get_current_time())
    if last_job:
        recipients = config['email']['success']
    else:
        recipients = config['email']['sub_success']
    EmailUtil.send_email(recipients, subject, message)
    sys.exit(0)