# Imports used by the conversion helpers below; config, script_args and
# read_sheet are project-local and assumed to be defined elsewhere.
import json

import numpy as np
import pandas as pd


def convert_fields(dict_data):
    config.log_message("Converting fields to Excel sheet...")
    dict_names = []
    names = []
    labels = []
    types = []
    constraints = []
    descriptions = []
    for dictionary in dict_data:
        for field in dictionary['fields']:
            dict_names.append(dictionary['code'])
            names.append(field['name'])
            labels.append(field['label'])
            types.append(field['type'])
            constraints.append(field['constraints'])
            descriptions.append(field['description'])
    data = {
        'dictionary_code': dict_names,
        'name': names,
        'label': labels,
        'type': types,
        'constraints': constraints,
        'description': descriptions
    }
    df = pd.DataFrame(data=data)
    config.log_message("Done!")
    return df
def create_catalogue_section():
    config.log_message("Converting the catalogue to JSON...")
    sheet = read_sheet('catalogue', {'key': str, 'value': str})
    sheet = sheet.dropna()
    result = sheet.to_dict(orient="records")
    supported_values = ['title', 'description', 'creator', 'contactPoint',
                        'license', 'versionInfo', 'keyword', 'identifier',
                        'rights', 'publisher_name', 'publisher_url']
    catalogue_dict = {}
    publisher = {}
    for row in result:
        if row['key'] in supported_values:
            # Keys that need more work to fit the required structure
            if row['key'] == 'publisher_name':
                publisher['name'] = row['value']
            elif row['key'] == 'publisher_url':
                publisher['url'] = row['value']
            elif row['key'] == 'keyword':
                catalogue_dict['keyword'] = row['value'].split(",")
            else:
                catalogue_dict[row['key']] = row['value']
    catalogue_dict["publisher"] = publisher
    config.log_message("Done!")
    return catalogue_dict
def create_json_structure():
    config.log_message("Converting Excel to FAIR JSON...")
    json_dict = create_configuration_section()
    json_dict['catalogue'] = create_catalogue_section()
    json_dict['dictionaries'] = create_dictionary_section()
    json_content = json.dumps(json_dict, indent=4)
    # Write mode ("w") so the output file holds a single JSON document
    # rather than accumulating copies across runs.
    with open(script_args.output_file, "w") as f:
        f.write(json_content)
    config.log_message("Conversion Complete. The FAIR JSON can be found in the file '"
                       + str(script_args.output_file) + "'")
def write_to_excel(dataframes):
    config.log_message("Writing Excel sheets...")
    # Using ExcelWriter as a context manager saves and closes the workbook on
    # exit (writer.save() was removed in pandas 2.0).
    with pd.ExcelWriter(script_args.output_file, engine='xlsxwriter') as writer:
        for sheet_name, df in dataframes.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    config.log_message("Conversion Complete. The Excel file can be found at '"
                       + str(script_args.output_file) + "'")
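# A minimal sketch (not part of the original module) of how the JSON -> Excel
# path might be wired together. It assumes json_dict holds FAIR JSON already
# parsed with json.load(); the sheet names mirror the ones read_sheet()
# expects on the way back.
def _example_json_to_excel(json_dict):
    dataframes = {
        'configuration': convert_configuration(json_dict),
        'catalogue': convert_catalogue(json_dict['catalogue']),
        'dictionaries': convert_dictionaries(json_dict['dictionaries']),
        'fields': convert_fields(json_dict['dictionaries']),
        'lookups': convert_lookups(json_dict['dictionaries']),
    }
    write_to_excel(dataframes)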
def convert_configuration(configuration_data):
    config.log_message("Converting configuration to Excel sheet...")
    keys = []
    values = []
    for key in configuration_data:
        if key in ('visibility', 'workflow_key', 'code'):
            keys.append(key)
            values.append(configuration_data[key])
    data = {'key': keys, 'value': values}
    df = pd.DataFrame(data=data)
    config.log_message("Done!")
    return df
def create_configuration_section():
    config.log_message("Converting the configuration to JSON...")
    sheet = read_sheet('configuration', {'key': str, 'value': str})
    sheet = sheet.dropna()
    result = sheet.to_dict(orient="records")
    supported_values = ['visibility', 'workflow_key', 'code']
    configuration_dict = {}
    for row in result:
        if row['key'] in supported_values:
            configuration_dict[row['key']] = row['value']
    config.log_message("Done!")
    return configuration_dict
def create_lookups_json(lookup_name, lookup_type):
    config.log_message("-- Converting lookup: '" + lookup_name + "' to JSON...")
    sheet = read_sheet('lookups', {'lookup': str, 'name': str, 'description': str})
    result = sheet.to_dict(orient="records")
    lookups_dict = {}
    options_arr = []
    lookups_dict['type'] = lookup_type
    lookups_dict['options'] = options_arr
    for row in result:
        if row['lookup'] == lookup_name:
            options_dict = {}
            options_dict['name'] = row['name']
            # Coerce to str so empty (NaN) descriptions serialise cleanly
            options_dict['description'] = str(row['description'])
            options_arr.append(options_dict)
    return lookups_dict
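# For reference, the structure returned above has this shape (the values are
# illustrative, not taken from any real sheet):
#
#     {
#         'type': 'categorical',
#         'options': [
#             {'name': 'M', 'description': 'Male'},
#             {'name': 'F', 'description': 'Female'},
#         ],
#     }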
def convert_dictionaries(dict_data):
    config.log_message("Converting dictionaries to Excel sheet...")
    codes = []
    names = []
    descriptions = []
    for dictionary in dict_data:
        codes.append(dictionary['code'])
        names.append(dictionary['name'])
        descriptions.append(dictionary['description'])
    data = {'code': codes, 'name': names, 'description': descriptions}
    df = pd.DataFrame(data=data)
    config.log_message("Done!")
    return df
# Imports for the mail reporting below; config is assumed to be a
# project-local module available in this file.
import smtplib
import sys
from email.mime.text import MIMEText


def process_report(text_content, html_content):
    global server_connected
    global server
    if config.config['method'] == 'mail':
        if config.config['mail_format'] == 'html':
            message = MIMEText(html_content, 'html', 'utf-8')
        else:
            message = MIMEText(text_content, 'plain', 'utf-8')
        message['Subject'] = 'Movies torrents digest'
        message['From'] = config.config['from']
        message['To'] = config.config['to']
        # Initialise explicitly so the flag is defined even when the
        # connection attempt below fails.
        server_connected = False
        try:
            server = smtplib.SMTP(config.config['smtp_server'])
            server_connected = True
        except Exception:
            config.log_message("Unexpected error while connecting to mail server: "
                               + str(sys.exc_info()[0]), 'error')
            config.log_message("Printing report to console\n")
            config.console_log("\n")
            config.console_log(text_content)
        if server_connected:
            try:
                server.ehlo()
                server.starttls()
                server.login(config.config['username'], config.config['password'])
                server.sendmail(config.config['from'], config.config['to'], message.as_string())
                server.quit()
                config.log_message("Report sent by mail")
            except Exception:
                config.log_message("Unexpected error while sending mail: "
                                   + str(sys.exc_info()[0]), 'error')
                config.log_message("Printing report to console")
                config.console_log("\n")
                config.console_log(text_content)
            finally:
                server.close()
    else:
        config.console_log("\n")
        config.console_log(text_content)
def create_dictionary_section():
    sheet = read_sheet('dictionaries', {'code': str, 'name': str, 'description': str})
    dictionaries = sheet.to_dict(orient="records")
    dicts_arr = []
    for row in dictionaries:
        config.log_message("Converting dictionary '" + row['name'] + "' to JSON...")
        fields_dict = {}
        fields_dict['code'] = row['code']
        fields_dict['name'] = row['name']
        fields_dict['description'] = row['description']
        fields, lookups = create_fields_json(row['code'])
        fields_dict['fields'] = fields
        fields_dict['lookups'] = lookups
        dicts_arr.append(fields_dict)
    config.log_message("Done!")
    return dicts_arr
def convert_catalogue(catalogue_data):
    config.log_message("Converting catalogue to Excel sheet...")
    keys = []
    values = []
    for key in catalogue_data:
        if key == 'publisher':
            keys.append('publisher_name')
            values.append(catalogue_data[key]['name'])
            if 'url' in catalogue_data[key]:
                keys.append('publisher_url')
                values.append(catalogue_data[key]['url'])
        elif key == 'keyword':
            keys.append('keyword')
            values.append(','.join(catalogue_data[key]))
        else:
            keys.append(key)
            values.append(catalogue_data[key])
    data = {'key': keys, 'value': values}
    df = pd.DataFrame(data=data)
    config.log_message("Done!")
    return df
def convert_lookups(dict_data):
    config.log_message("Converting lookups to Excel sheet...")
    lookup_names = []
    field_names = []
    descriptions = []
    for dictionary in dict_data:
        for lookup in dictionary['lookups']:
            for vocab in dictionary['lookups'][lookup]['options']:
                lookup_names.append(lookup)
                field_names.append(vocab['name'])
                descriptions.append(vocab['description'])
    data = {
        'lookup': lookup_names,
        'name': field_names,
        'description': descriptions
    }
    df = pd.DataFrame(data=data)
    config.log_message("Done!")
    return df
def create_fields_json(dictionary_code):
    config.log_message("-- Converting fields to JSON...")
    sheet = read_sheet('fields', {'dictionary_code': str, 'name': str, 'label': str,
                                  'type': str, 'constraints': str, 'description': str})
    # Missing cells become the literal string "null"
    sheet = sheet.replace(np.nan, "null")
    result = sheet.to_dict(orient="records")
    fields_arr = []
    constraints_dict = {}
    for row in result:
        if row['dictionary_code'] == dictionary_code:
            fields_dict = {}
            fields_dict['name'] = row['name']
            fields_dict['label'] = row['label']
            fields_dict['type'] = row['type']
            fields_dict['constraints'] = row['constraints']
            if row['constraints'] != 'null':
                # Remember the lookup type for each constraint so the lookups
                # can be resolved after all fields are processed.
                constraints_dict[row['constraints']] = row['type']
            fields_dict['description'] = row['description']
            fields_arr.append(fields_dict)
    lookups_dict = {}
    for constraint, lookup_type in constraints_dict.items():
        lookups_dict[constraint] = create_lookups_json(constraint, lookup_type)
    return fields_arr, lookups_dict
# Imports for the feed parser; the parse()/['entries'] usage below suggests
# feedparser. config, feed_urls and the analyze_* / comparing_title /
# get_imdb_info helpers are assumed to be defined elsewhere in the project.
import re
import sys
from io import StringIO
from time import sleep

from feedparser import parse

import mail_utils


def parse_feed():
    list_movie = dict()
    list_movie_discarded = dict()
    execution_log = StringIO()
    first_print = True
    for feed_url in feed_urls:
        results = parse(feed_url)
        for entry in results['entries']:
            torrent_title = entry['title'].strip()
            # Depending on the feed source, a link to the torrent's HTML page
            # might be added. We're only interested in the link to the torrent
            # file itself, which should lie at the end of the list.
            torrent_file_url = entry['links'][-1]['href']
            # Delay console prints while the user is prompted for configuration
            if config.status == 'init':
                execution_log.write("Processing : " + torrent_title + "\n")
            elif config.status == 'crash':
                sys.exit()
            else:
                if first_print:
                    previous_logs = execution_log.getvalue()
                    execution_log.close()
                    if len(previous_logs) > 0:
                        for line in previous_logs.splitlines():
                            config.log_message(line)
                    first_print = False
                config.log_message("Processing " + torrent_title + "\n")
            pos = 0
            properties = dict()
            has_title = False
            while pos < len(torrent_title) - 1:
                data, pos, has_title = analyze_filename_content(torrent_title, pos, has_title)
                key = data[0].lower()
                value = re.sub(r'_|\.', r' ', data[1])
                if key in properties:
                    # Multi-valued attributes accumulate; other keys keep
                    # their first value.
                    if key in ['misc', 'tag', 'lan']:
                        properties[key] += u" " + value.lstrip().strip()
                else:
                    properties[key] = value.lstrip().strip()
            if "summary" in entry:
                summary = entry['summary']
                pos = 0
                tmp_dict = dict()
                while pos < len(summary) - 1:
                    data, pos = analyze_summary_content(summary, pos)
                    if data != ():
                        key = data[0].lower()
                        tmp_dict[key] = data[1].strip()
                if 'rating' in tmp_dict and 'title' in tmp_dict and 'year' in tmp_dict:
                    # A torrent whose summary has all three pieces of information
                    # above is considered reliable. Prefer these data over data
                    # extracted from the torrent's name.
                    for key, value in tmp_dict.items():
                        properties[key] = value
            if 'title' in properties:
                if 'rip' in properties:
                    if 'lan' not in properties or re.search(r'hindi|punjabi', properties['lan'].lower()) is None:
                        key = comparing_title(properties['title'])
                        if key not in list_movie:
                            if torrent_title not in list_movie_discarded:
                                get_imdb_info(properties)
                                if properties['trust_imdb'] and properties['rating'] < 6.5:
                                    properties['discard'] = ('Bad IMDB rating : '
                                                             + str(properties['rating'])
                                                             + ' - '
                                                             + properties['imdb_url'])
                            else:
                                # Already discarded earlier; this text is never
                                # displayed, it only keeps the torrent out of
                                # list_movie below.
                                properties['discard'] = 'Dummy text not used'
                    else:
                        properties['discard'] = 'Hindi movie'
                else:
                    properties['discard'] = 'Not a rip'
            else:
                properties['discard'] = "No title found in torrent's name"
                properties['title'] = torrent_title
            properties['torrent_title'] = torrent_title
            properties['torrent_file_url'] = torrent_file_url
            # Depending on the feed source, the size of the torrent content
            # might be stored in different keys.
            try:
                byte_length = int(entry['torrent_contentlength'])
            except Exception:
                byte_length = 0
            try:
                byte_length = int(entry['contentlength']) if byte_length == 0 else byte_length
            except Exception:
                byte_length = 0
            try:
                byte_length = int(entry['size']) if byte_length == 0 else byte_length
            except Exception:
                byte_length = 0
            if byte_length != 0:
                mb = byte_length // (1024 * 1024)
                if mb > 1024:
                    gb = round(mb / float(1024), 2)
                    properties['size'] = (str(gb), 'GB')
                else:
                    properties['size'] = (str(mb), 'MB')
            else:
                properties['size'] = None
            if 'discard' not in properties:
                key = comparing_title(properties['title'])
                if key not in list_movie:
                    list_movie[key] = []
                list_movie[key].append(properties)
            else:
                if torrent_title not in list_movie_discarded:
                    list_movie_discarded[torrent_title] = properties
    log_to_print = False
    while config.status == 'init':
        log_to_print = True
        sleep(1)
    if config.status == 'ok':
        if log_to_print:
            previous_logs = execution_log.getvalue()
            execution_log.close()
            if len(previous_logs) > 0:
                for line in previous_logs.splitlines():
                    config.log_message(line)
    else:
        sys.exit()
    html_content, text_content = mail_utils.format_report(list_movie, list_movie_discarded)
    mail_utils.process_report(text_content, html_content)
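# A minimal invocation sketch, assuming feed_urls and config are initialised
# elsewhere in the module as the code above implies (illustrative only).
if __name__ == '__main__':
    parse_feed()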