def copy_resources_from_data_folder_only_if_ready(p_root_path_data, p_config):
    is_ok_found = False
    root_found = ''
    folder_list = {}
    if not os.path.exists(p_root_path_data):
        print("Error: the folder doesn't exist:", p_root_path_data)
        exit(-1)
    else:
        for root, dirs, files in os.walk(p_root_path_data):
            if is_ok_found and root_found in root:
                if (b_any("distance_map" in x for x in files)
                        or b_any("_image" in x for x in files)
                        or b_any("object_index" in x for x in files)):
                    print("Currently : {}".format(root))
                    result_folder, array_path_images = subdivide_image(root, files, p_config)
                    folder_list.setdefault(result_folder, []).append(array_path_images)
            else:
                if "OK.txt" in str(files):
                    # The marker file is present, meaning we can process the images
                    is_ok_found = True
                    root_found = root
                else:
                    is_ok_found = False
    prepare_data_for_learning(folder_list, p_config)
def eliminate_hidden_twins(hidden_twins_list, values):
    """Eliminate values using the hidden twins strategy.

    Args:
        hidden_twins_list (list): a list of lists containing each pair of hidden/naked twins.
            Example: [[A1, B1], [D4, E5], [H4, C4]]
        values (dict): a dictionary of the form {'box_name': '123456789', ...}

    Returns:
        values (dict): the values dictionary with the hidden twins eliminated from peers.
    """
    try:
        logger.info("hidden_twins() INITIAL VALUES : " + str(values) + "\n")
        for i in range(len(hidden_twins_list)):
            box1 = hidden_twins_list[i][0]  # From the example above: hidden_twins_list[0][0] --> A1
            box2 = hidden_twins_list[i][1]  # From the example above: hidden_twins_list[0][1] --> B1
            logger.info("box1 = " + box1 + "\n")
            logger.info("box2 = " + box2 + "\n")
            # Build a set with the common peers, the intersection of both peer sets
            common_peers = set(peers[box1]) & set(peers[box2])
            logger.info("hidden_twins(): common_peers = " + str(common_peers) + "\n")
            hidden_pair = list(set(values[box1]) & set(values[box2]))
            logger.info("hidden_twins(): hidden_pair = " + str(hidden_pair) + "\n")
            common_peers = [values[k] for k in common_peers]
            if not (b_any(hidden_pair[0] in x for x in common_peers)
                    or b_any(hidden_pair[1] in x for x in common_peers)):
                assign_value(values, box1, hidden_pair[0] + hidden_pair[1])
                assign_value(values, box2, hidden_pair[0] + hidden_pair[1])
                logger.info("hidden_twins(): assigned = " + hidden_pair[0] + hidden_pair[1]
                            + " in " + box1 + " and " + box2 + "\n")
        logger.info("hidden_twins() UPDATED VALUES : " + str(values) + "\n")
        return values
    except Exception as err:
        logger.error("eliminate_hidden_twins(): Fatal error eliminating hidden twins: %s\n", err)
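# A minimal driver for eliminate_hidden_twins, assuming toy stand-ins for the
# module-level peers / assign_value / logger used above (illustrative only,
# not the project's real definitions).
import logging
logger = logging.getLogger(__name__)

peers = {'A1': ['A2', 'A3'], 'B1': ['A2', 'B2']}

def assign_value(values, box, value):
    # Simplified stand-in for the project's assign_value helper
    values[box] = value
    return values

values = {'A1': '239', 'B1': '238', 'A2': '14', 'B2': '456', 'A3': '567'}
# A1 and B1 share exactly the pair {2, 3}, and their only common peer (A2)
# contains neither digit, so both boxes are reduced to that pair.
print(eliminate_hidden_twins([['A1', 'B1']], values))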
def expressao_interrogativa(frase, traducao_glosas, tags):
    """
    Adds facial-expression markers to global (yes/no) interrogatives.

    :param frase: the sentence
    :param traducao_glosas: the sentence with some manual rules already applied
    :param tags: part-of-speech tags of the words
    :return: the gloss translation with the interrogative marker applied
    """
    if frase[-1] == "?" and not b_any("PT" in x for x in tags) and not b_any(
            "RGI" in x for x in tags):
        traducao_glosas = "{" + traducao_glosas + "}(q)"
    return traducao_glosas
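# Illustrative call: a global interrogative gets wrapped with the (q)
# facial-expression marker (example sentence and tags are made up).
print(expressao_interrogativa("Queres café?", "QUERER CAFÉ", ["V", "N"]))
# -> '{QUERER CAFÉ}(q)'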
def create_page_rank(self, lambda_value):
    i_vector = dict()
    r_vector = dict()
    list_websites = list(self.graph_bfs)
    for each_website in list_websites:
        i_vector[each_website] = 1 / len(list_websites)
    diff = 1
    iteration_count = 1
    while diff > 0.0005 and iteration_count <= 4:
        diffsum = 0
        for each_website in list_websites:
            r_vector[each_website] = lambda_value / len(list_websites)
        for each_website in list_websites:
            list_inlinks = self.graph_bfs[each_website]
            q = []
            for each_link in list_inlinks:
                try:
                    if b_any(each_link in x for x in list_websites) \
                            and b_any(each_website in x for x in self.graph_bfs[each_link]):
                        q.append(each_link)
                except KeyError:
                    pass
            if len(q) > 0:
                for each_outlink in q:
                    r_vector[each_outlink] = r_vector[each_outlink] \
                        + ((1 - lambda_value) * (i_vector[each_website] / len(q)))
            else:
                # Dangling page: spread this page's rank evenly over all pages
                for each_website_inner in list_websites:
                    r_vector[each_website_inner] = r_vector[each_website_inner] \
                        + ((1 - lambda_value) * (i_vector[each_website] / len(list_websites)))
        for each_website in list(r_vector):
            diffsum += (r_vector[each_website] - i_vector[each_website]) ** 2
        i_vector = copy.deepcopy(r_vector)
        diff = diffsum ** 0.5
        iteration_count += 1
    return r_vector
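# Illustrative driver, assuming create_page_rank is reachable as a plain
# module-level function and graph_bfs maps each page to the pages it links to
# (hypothetical data, not a real crawl). `import copy` is needed by the
# deepcopy call above.
import copy

class Crawler:
    create_page_rank = create_page_rank  # reuse the function above as a method

    def __init__(self, graph_bfs):
        self.graph_bfs = graph_bfs

crawler = Crawler({'Mars_Rover': ['NASA'],
                   'NASA': ['Mars_Rover', 'Apollo'],
                   'Apollo': ['NASA']})
print(crawler.create_page_rank(lambda_value=0.15))  # ranks sum to roughly 1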
def _addTopoInfo(theChainDef, chainDict, topoAlgs, doAtL2AndEF=True):
    maxL2SignatureIndex = -1
    for signatureIndex, signature in enumerate(theChainDef.signatureList):
        if signature['listOfTriggerElements'][0][0:2] == "L2":
            maxL2SignatureIndex = max(maxL2SignatureIndex, signatureIndex)

    inputTEsL2 = theChainDef.signatureList[maxL2SignatureIndex]['listOfTriggerElements']
    inputTEsEF = theChainDef.signatureList[-1]['listOfTriggerElements']

    if 'muvtx' in topoAlgs:
        theChainDef = generateMuonClusterLLPchain(theChainDef, chainDict, inputTEsL2,
                                                  inputTEsEF, topoAlgs)
    elif 'revllp' in topoAlgs:
        theChainDef = generateReversedCaloRatioLLPchain(theChainDef, chainDict, inputTEsL2,
                                                        inputTEsEF, topoAlgs)
    elif 'llp' in topoAlgs:
        theChainDef = generateCaloRatioLLPchain(theChainDef, chainDict, inputTEsL2,
                                                inputTEsEF, topoAlgs)
    elif b_any('invm' in x or 'deta' in x for x in topoAlgs):
        theChainDef = addDetaInvmTopo(theChainDef, chainDict, inputTEsL2, inputTEsEF, topoAlgs)
    else:
        logJet.error('Your favourite topo configuration is missing.')
    return theChainDef
def stageAlgo(types):
    desiredDataTypes = ['double']
    exclude = ['AZS', 'Tobins.Q', 'Tobins.Q.class', 'AZS.class', 'Feml.CEO.or.Equiv']
    variable_types = {}
    for index, row in types.iterrows():
        if b_any(row["Type"] in x for x in desiredDataTypes):
            if row["Field"] not in exclude:
                variable_types[row["Field"]] = "c"
    return variable_types
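# Quick sanity check with a toy DataFrame shaped like the `types` input above
# (column names inferred from the code; the data is made up):
import pandas as pd
types_df = pd.DataFrame({'Field': ['Revenue', 'AZS'], 'Type': ['double', 'double']})
print(stageAlgo(types_df))  # {'Revenue': 'c'} -- 'AZS' is in the exclude list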
def stageAlgo(types, target):
    desiredDataTypes = ['double']
    targetVars = ['AZS', 'Tobins.Q', 'Tobins.Q.class', 'AZS.class']
    variable_types = {}
    for index, row in types.iterrows():
        if b_any(row["Type"] in x for x in desiredDataTypes):
            if row["Field"] not in targetVars:
                variable_types[row["Field"]] = "c"
    # Add in the target
    variable_types[target] = "c"
    return variable_types
def add_word_objects(step_objs, word_objs):
    """
    Adding the word-level objects to the step-level ones.

    :rtype: list
    """
    step_objs = set(step_objs.split(','))
    for obj in word_objs:
        obj = obj.split('>')[0].strip().lower()
        if obj and not obj.endswith(headless_objs) and not b_any(
                obj in x for x in step_objs):
            step_objs.add(obj)
    return list(step_objs)
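# Quick illustration; headless_objs here is a hypothetical suffix tuple
# standing in for the real module-level constant:
headless_objs = ('screen',)
print(add_word_objects('oven,pan', ['Spoon > utensil', 'touchscreen']))
# -> ['oven', 'pan', 'spoon'] in some order; 'touchscreen' is dropped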
def __filter(tokens2d, pno):
    new_tokens2d = []
    new_pno = []
    for tokens1d, single_p in zip(tokens2d, pno):
        # tokens1d is a sentence of word tokens, each carrying a label.
        # Keep the sentence only if any of its labels contain POSITIVE_LABEL;
        # otherwise discard it.
        if b_any(cfg.POSITIVE_LABEL in token.label for token in tokens1d):
            new_tokens2d.append(tokens1d)
            new_pno.append(single_p)
    return new_tokens2d, new_pno
def create_graph_from_map(self):
    list_urls = list(self.graph)
    url_vs_incoming_links = dict()
    for each_url in list_urls:
        in_links = []
        for listed_url in list_urls:
            if b_any(each_url in x for x in self.graph[listed_url]):
                in_links.append(listed_url)
        url_vs_incoming_links[each_url] = in_links
    print("--------------------Graph--------------------")
    print(url_vs_incoming_links)
    self.write_dictionary_to_file(url_vs_incoming_links)
def __receiveThread(self):
    'Thread that prints incoming messages'
    while self.comPort:
        if self.comPort.inWaiting() > 0:
            c = self.comPort.readline()
            if self.recuperarMensaje is not None and self.recuperarMensaje in c.decode():
                # In case a message needs to be recovered
                self.mensajeRecuperado = c.decode()
            if b_any(x in c.decode() for x in wordsToPrint):
                if wordsToPrint[6] in c.decode():
                    msg = c.decode().split(",")
                    print("\n Message from Topic: " + msg[1] + "\n > " + msg[3])
                else:
                    print(c.decode())
            if debug:
                print("Debugger > " + c.decode())
        else:
            time.sleep(0.1)
def move_files(destdir, maindir):
    """Consolidate files from a multi-zip download into a single directory per subject."""
    print(f"Moving files from {maindir} and consolidating into {destdir}\n")
    if not os.path.exists(destdir):
        os.mkdir(destdir)

    # Generate the list of full filepaths for each image file
    fileroots = [os.path.join(root, filename)
                 for root, dirnames, filenames in os.walk(maindir)
                 if len(root.split('/')) == 7
                 for filename in filenames]

    # Check if a directory already exists along the filepath to each image,
    # so that images from the same directory are consolidated together.
    # Otherwise, create the directory for the image in the destination dir.
    for filepath in fileroots:
        filedest = [rootd for rootd, dirnamesd, filenamesd in os.walk(destdir)]
        for step in range(3, len(filepath.split('/')[:-1])):
            if not b_any('/'.join(filepath.split('/')[3:step + 1]).upper()
                         == '/'.join(dest.split('/')[2:step]).upper()
                         for dest in filedest if len(dest.split('/')) >= step):
                os.mkdir(os.path.join(destdir, '/'.join(filepath.split('/')[3:step + 1])))
        os.rename(filepath, os.path.join(destdir, '/'.join(filepath.split('/')[3:])))

    print(f'... emptying {maindir} ...\n')
    # Empty the Downloads folder
    while len([dwnld_dir for dwnld_dir in os.listdir(maindir)
               if os.path.isdir(os.path.join(maindir, dwnld_dir))]) > 0:
        for root, dirnames, filenames in os.walk(maindir):
            if os.path.isfile(os.path.join(root, '.DS_Store')):
                os.remove(os.path.join(root, '.DS_Store'))
            if os.path.isdir(root) and not os.listdir(root):
                os.rmdir(root)
    print(f"All files moved from {maindir} and consolidated into {destdir}\n\n")
    return
def filter(self):
    # tokens2d, pos_tags, and conll_deps are filtered if a sentence was not tagged
    new_tokens2d = []
    new_pos_tags = []
    new_conll_deps = []
    for tokens1d, pos_tag1d, deps1d in zip(self.tokens2d, self.pos_tags, self.conll_deps):
        # tokens1d is a sentence of word tokens, each carrying a label.
        # Keep the sentence only if any of its labels contain POSITIVE_LABEL;
        # otherwise discard it.
        if b_any(cfg.POSITIVE_LABEL in token.label for token in tokens1d):
            new_tokens2d.append(tokens1d)
            new_pos_tags.append(pos_tag1d)
            new_conll_deps.append(deps1d)
    self.tokens2d = new_tokens2d
    self.pos_tags = new_pos_tags
    self.conll_deps = new_conll_deps
def getDailyMailArticleLinks(self, topic, page_number):
    links = []
    page_distance = 50
    off_set = until = 0
    # Unlike The Independent, the Daily Mail does not store the topic in the
    # news article, so it must be appended manually
    url_apendix = "|" + topic
    try:
        off_set = (page_number - 1) * page_distance
        if page_number == 1:
            until = page_distance
        else:
            until = ((page_number - 1) * page_distance) * 2
        url = ('https://www.dailymail.co.uk/home/search.html?offset={}&size={}'
               '&sel=site&searchPhrase={}&sort=recent&type=article&type=video'
               '&type=permabox&days=all').format(off_set, until, topic)
        # Get the html for the page
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        # 1. Get the articles div
        articles_div = soup.find('div', attrs={'class': 'sch-results'})
        # Get all of the divs that contain links
        article_divs = articles_div.findAll('div')
        # 2. The first two and the last article divs do not contain links
        for article in article_divs[2:-1]:
            for url in article.findAll('a'):
                if '#' not in url.get('href'):
                    # Ensure the link has not already been added somehow
                    if not b_any(url.get('href') in link for link in links):
                        links.append('https://www.dailymail.co.uk/' + url.get('href') + url_apendix)
    except AttributeError:
        print('Failed to find what we looked for')
        print('Sleeping...')
        time.sleep(10)
    return links
def search(files, mod, tag, search):
    """
    Return a dictionary of files containing the specific tag, as well as the
    tags that exist in those files, for additional context to be provided to
    the user.
    """
    # Construct relevant pieces
    search_construct = mod + search
    tags_per_file = dict()
    for file in files:
        with open(file, 'r') as fl:
            contents = fl.read()
        # Get all tags in a file
        taglist = set(re.findall(search_construct, contents))
        # Strip modifier from all tags
        taglist_stripped = [t.strip(mod) for t in taglist]
        # Substring-match our tag against the items in taglist_stripped
        if b_any(tag in tags for tags in taglist_stripped):
            # File contains a match; store all tags for context
            tags_per_file[file] = taglist_stripped
    return tags_per_file
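# A small usage sketch, assuming '@' as the tag modifier and a word-character
# search pattern (both hypothetical; pass whatever your tag syntax uses):
import os
import re
import tempfile

tmp = tempfile.NamedTemporaryFile('w', suffix='.md', delete=False)
tmp.write('notes about @python and @testing')
tmp.close()
print(search([tmp.name], mod='@', tag='test', search=r'\w+'))
# -> {'/tmp/...md': ['python', 'testing']} (substring match on 'testing')
os.unlink(tmp.name)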
def fetch(region):
    print('>Fetching {0}'.format(region))
    url = 'https://www.wowhead.com/world-quests/{0}'.format(region)
    req = Request(url)
    try:
        urlcleanup()
        response = urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print(' We failed to reach a server.')
            print(' Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print(' The server couldn\'t fulfill the request.')
            print(' Error code: ', e.code)
        return None, None
    else:
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Get script text
        text = soup(["script"])
        for tag in text:
            if b_any('lvWorldQuests' in word for word in tag.contents):
                for line in tag.contents:
                    moreLines = line.split('\n')
                    wqLines = list(filter(lambda x: 'lvWorldQuests' in x, moreLines))
                    for wqLine in wqLines:
                        _lines = wqLine.split(', data: [')
                        quests = json.loads('[' + _lines[1][:-4] + ']')
                        return quests
    return 'worldQuest'
def get_sites(permissions):
    site_slugs = []
    if b_any('scribd' in x.lower() for x in permissions):
        site_slugs.append('SC')
    if b_any('google' in x.lower() for x in permissions):
        site_slugs.append('GB')
    if b_any('kobo' in x.lower() for x in permissions):
        site_slugs.append('KO')
    if b_any('test' in x.lower() for x in permissions):
        site_slugs.append('TB')
    if b_any('livraria' in x.lower() for x in permissions):
        site_slugs.append('LC')
    if b_any('audio' in x.lower() for x in permissions):
        site_slugs.append('AU')
    return site_slugs
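# Example call with made-up permission strings:
print(get_sites(['scribd subscription', 'Google Books preview']))  # ['SC', 'GB']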
def parser(ingredient, food_array):
    if type(ingredient) != str:
        return

    parsed_word = ''
    # Remove unnecessary special characters
    ingredient = ingredient.strip()
    for char in ['-', '+', ':', ';', '/', '\'', '"', '%', '.', '&', '[', ']', '\u2009']:
        ingredient = ingredient.replace(char, ' ')
    ingredient = ingredient.replace('®', '')

    # Break the ingredient into a list of words
    split_item = ingredient.split(" ")
    for word in split_item:
        word = word.lower()
        # Take care of whole numbers and decimals
        if word.isnumeric() or word.isdecimal():
            continue
        elif b_any(word in x for x in description_exceptions):
            parsed_word = word + ' '
            continue
        elif word in nonplurals:
            word = word[:-1]
            parsed_word = parsed_word + word + ' '
            continue
        elif ',' in word:
            last_word = word.replace(',', '')
            if last_word in nonplurals:
                last_word = last_word[:-1]
            parsed_word = parsed_word + last_word
            break
        elif word == 'or':
            break
        elif word == 'and':
            parsed_word = parsed_word.rstrip()
            # food_array.append(parsed_word)
            parsed_word = ''
            continue
        elif '(' in word or ')' in word:
            continue
        elif any(b_any(word in x for x in skip_list) for skip_list in (
                vulgarFractions, measurementUnits, measurementUnitsAbbreviations,
                numbers, brands, descriptions, modifier, precedingAdverbs,
                succeedingAdverbs, prepositions, descriptionsWithPredecessor,
                unnecessaryDescriptions, hypenatedPrefixes, hypenatedSuffixes)):
            # Skip fractions, units, quantities, brands, and other descriptors
            continue
        else:
            parsed_word = parsed_word + word + ' '

    parsed_word = parsed_word.strip()
    # Prevent blank spots in the ingredients array
    if parsed_word == '':
        return food_array
    else:
        food_array.append(parsed_word)
        return food_array
def func_depd_pruning(self):
    '''
    Function for determining functional dependencies using the naive approach.
    :return: None
    '''
    input = ['movieid', 'type', 'startyear', 'runtime', 'avgrating',
             'genreid', 'genre', 'memberid', 'birthyear', 'role']
    output = sum([list(map(list, combinations(input, i))) for i in range(3)], [])
    output.pop(0)  # delete the empty set
    concat_list = [', '.join(sub_list) for sub_list in output]
    table_list = []
    table_dict = {}
    for column in concat_list:
        column_list = []
        query = ("SELECT array_agg(nid) FROM normalization GROUP BY "
                 + column + " ORDER BY " + column)
        self.cursor.execute(query)
        arrays = self.cursor.fetchall()
        for array in arrays:
            array = str(array).strip('()[],')
            column_list.append(array.translate(str.maketrans('', '', '()[]')).split(', '))
        table_list.append(column_list)
    table_dict.clear()
    for i in range(0, len(concat_list)):
        table_dict[concat_list[i]] = table_list[i]

    func_depd = []
    for left_col in table_dict.keys():
        for right_col in input:
            count = 0
            lolleft = table_dict[left_col]
            lolright = table_dict[right_col]
            for left_list in lolleft:
                for right_list in lolright:
                    if set(left_list) <= set(right_list):
                        count += 1
                        break
            if count == len(lolleft):
                leftcollist = left_col.split(", ")
                someflag = True
                for col in leftcollist:
                    if col.strip(" ") == right_col.strip(" "):
                        someflag = False
                if someflag:
                    word = "-->" + str(right_col)
                    if not b_any(word in x for x in func_depd):
                        func_depd.append(left_col + "-->" + right_col)
    print(func_depd)
def main(*args):
    stock_path = 'Stock_Data'
    if args[0] == 1:
        rmtree(stock_path)
    if not os.path.exists(stock_path):
        os.makedirs(stock_path)
    start = str(datetime.date.today() - datetime.timedelta(days=int(args[1] * 365)))
    end = str(datetime.date.today())
    tickers = pd.read_csv(args[2])
    onlyfiles = [f for f in listdir(stock_path) if isfile(join(stock_path, f))]
    Doneticks = [i.split('.csv', 1)[0] for i in onlyfiles]
    skipticks = []
    for tick, comp, divi, mc in zip(tickers['Symbol'], tickers['Name'],
                                    tickers['Dividend'], tickers['Market Cap, Billions']):
        f_str = tick + '__' + comp
        if not b_any(f_str in x for x in Doneticks):
            try:
                df = pdr.get_data_yahoo(tick, start, end)
                if len(df) > 0:
                    df = df.reset_index()
                    df['Count'] = df.index
                    df['Week Day'] = -1
                    df['Change'] = ((df['Close'] / df['Open']) - 1) * 100
                    for index, row in df.iterrows():
                        date = row['Date']
                        df.at[index, 'Week Day'] = date.weekday()
                    # 14-day RSI from the ratio of average gains to average losses
                    period = 14
                    delta = df['Close'].diff()
                    dUp, dDown = delta.copy(), delta.copy()
                    dUp[dUp < 0] = 0
                    dDown[dDown > 0] = 0
                    RS = dUp.rolling(period).mean() / dDown.rolling(period).mean().abs()
                    df['RSI'] = 100.0 - (100.0 / (1.0 + RS))
                    df = df.fillna(df['Close'].min())
                    fname = stock_path + '/' + tick + '__' + comp + '.csv'
                    df.to_csv(fname, sep=',', encoding='utf-8', index=False)
                else:
                    tickers = tickers[tickers.Symbol != tick]
            except Exception:
                skipticks.append(tick)
    # Save out a csv with delisted tickers removed
    tickers.to_csv(args[2][:-4] + '-new.csv', sep=',', encoding='utf-8', index=False)
    if len(listdir(stock_path)) < 7000:
        print(f'Missing some stocks (returned only {len(listdir(stock_path))} '
              f'when there are likely more)')
def does_sent_have_tags(labels):
    return b_any('B' in x or 'I' in x for x in labels)
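# Quick check with hypothetical BIO label sequences:
print(does_sent_have_tags(['O', 'B-Action', 'I-Action']))  # True
print(does_sent_have_tags(['O', 'O', 'O']))                # False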
# rsi_df['Below 30'] = downs
# rsi_df1 = rsi_df[rsi_df['Above 70'] > 0]
# rsi_df2 = rsi_df1[rsi_df1['Below 30'] > 0]
# rsi_df3 = rsi_df2[rsi_df2['Total'] > 50]

Files_for_plotting = listdir(stock_path)
Files_for_plotting = Files_for_plotting[1:3]
for ff in Files_for_plotting:
    t_c = ff.split('.csv', 1)[0]
    tick = t_c.split('__', 1)[0]
    company = t_c.split('__', 1)[1]
    check = tick + '_' + company
    if not b_any(check in x for x in Doneticks_plots):
        csvname = stock_path + '/' + ff
        df_data = pd.read_csv(csvname, encoding="ISO-8859-1")
        x1 = df_data['Count'].reset_index()
        x1 = x1 + (1260 - np.max(x1))
        y1 = df_data['Close'].reset_index()
        x1 = x1.drop(['index'], axis=1)
        y1 = y1.drop(['index'], axis=1)
        if len(x1) > 30:
            x2 = x1.tail(520)
            x3 = x1.tail(260)
            x4 = x1.tail(195)
            x5 = x1.tail(130)
def create_and_store_lyrics(sp, artist_name=None, artist_dic=None):
    # This is for scraping genius.com
    dictionary = enchant.Dict("en_US")
    albums = sp.artist_albums(artist_id=artist_name, limit=50)
    songs = []
    for i in range(len(albums['items'])):
        album_uri = albums['items'][i]['uri']
        album_tracks = sp.album_tracks(album_uri)
        for j in range(len(album_tracks['items'])):
            songs.append(album_tracks['items'][j]['name'])

    global_list = []
    for title in songs:
        local_list = []
        for word in title.split():
            if word.isalnum() and dictionary.check(word):
                local_list.append(word.lower())
        global_list.append('-'.join(local_list))

    res = []
    for i in range(len(global_list)):
        try:
            r = requests.get('https://genius.com/Genius-translations-' + dic[artist_dic]
                             + '-' + global_list[i] + '-english-translation-lyrics')
            soup = BeautifulSoup(r.text, 'lxml')
            lyrics = soup.find('div', class_='lyrics').get_text()
            if [global_list[i], lyrics] not in res:
                res.append([global_list[i], lyrics])
        except Exception:
            r = requests.get('https://genius.com/' + dic[artist_dic].title()
                             + '-' + global_list[i] + '-lyrics')
            soup = BeautifulSoup(r.text, 'lxml')
            try:
                lyrics = soup.find('div', class_='lyrics').get_text()
                if 'English Translation' in lyrics:
                    search_list = ['Japanese Translation', 'Romanized', 'Korean Original',
                                   'Hangul', 'French Translation', 'Romanization',
                                   'Chinese Translation', 'Original', 'Chinese',
                                   'Japanese', 'French', 'Korean']
                    s = lyrics.split('English Translation')[1].split()
                    for index in range(len(s)):
                        next_index = index + 1
                        if b_any(substring in lyrics for substring in search_list):
                            # Slide next_index forward to the next section marker
                            # so only the English block is kept
                            while s[next_index] not in search_list and next_index < len(s) - 1:
                                next_index += 1
                            english_lyrics = ' '.join(s[index:next_index])
                            break
                        else:
                            english_lyrics = ' '.join(s)
                    if [global_list[i], english_lyrics] not in res:
                        res.append([global_list[i], english_lyrics])
            except Exception:
                pass
    pd.DataFrame(res).to_csv('lyrics_data/' + dic[artist_dic] + '_translated_lyrics.csv')
# Acronym
# Robyn Lesch
# 17 June 2020
# Mood: exhausted

from builtins import any as b_any

ignoreStrings = input("Enter words to be ignored separated by commas: \n")
ignoreList = ignoreStrings.split(",")
title = input("Enter a title to generate its acronym: \n")
titleList = title.split()

print("The acronym is:")
for x in titleList:
    if not b_any(x.lower() in y.lower() for y in ignoreList):
        print(x[0].upper(), end="")
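# Example session (illustrative input/output):
#   Enter words to be ignored separated by commas:
#   the,of,a
#   Enter a title to generate its acronym:
#   The Lord of the Rings
#   The acronym is:
#   LR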
def create_and_store_lyrics(sp, artist_name=None, artist_dic=None):
    # This is for scraping genius.com
    dictionary = enchant.Dict("en_US")
    albums = sp.artist_albums(artist_id=artist_name, limit=50)
    songs = []
    for i in range(len(albums['items'])):
        album_uri = albums['items'][i]['uri']
        album_tracks = sp.album_tracks(album_uri)
        for j in range(len(album_tracks['items'])):
            songs.append(album_tracks['items'][j]['name'])

    global_list = []
    for title in songs:
        local_list = []
        for word in title.split():
            if word.isalnum() and dictionary.check(word):
                local_list.append(word.lower())
        global_list.append('-'.join(local_list))

    res = []
    for i in range(len(global_list)):
        try:
            r = requests.get('https://genius.com/' + dic[artist_dic] + '-'
                             + global_list[i] + '-lyrics')
            soup = BeautifulSoup(r.text, 'lxml')
            lyrics = soup.find('div', class_='lyrics').get_text().lower()
            split_list = ['hangul', 'korean', 'korean original']
            search_list = ['japanese', 'romanized', 'french', 'romanization',
                           'original', 'chinese', 'japanese', 'english', 'translation']
            total_list = split_list + search_list
            if not b_any(substring in lyrics for substring in total_list):
                # 1) This is just pure Korean text. Sometimes there will be
                # translated lyrics without explicit mention of translations,
                # resulting in these lyrics being stored.
                if [global_list[i], lyrics] not in res:
                    res.append([global_list[i], lyrics])  # just append the Korean text
            else:
                if any(substring in lyrics for substring in split_list):
                    # 2) Korean lyrics mixed with other translations
                    split_string = [m for m in filter(lambda x: x in lyrics, split_list)][0]
                    s = lyrics.split(split_string)[1].split()
                    for index in range(len(s)):
                        next_index = index + 1
                        if b_any(substring in lyrics for substring in search_list):
                            # Use a sliding window to find the end of the Korean
                            # lyrics via the keywords in search_list, then extract them
                            while s[next_index] not in search_list and next_index < len(s) - 1:
                                next_index += 1
                            korean_lyrics = ' '.join(s[index:next_index])
                            break
                    if [global_list[i], korean_lyrics] not in res:
                        # Append the found Korean lyrics to the result list
                        res.append([global_list[i], korean_lyrics])
                else:
                    # 3) Foreign translations without Korean words: just drop it
                    pass
        except Exception:
            # If the link itself doesn't work, just drop the song
            pass
    pd.DataFrame(res).to_csv('../lyrics_data/Korean_lyrics/' + dic[artist_dic]
                             + '_original_lyrics.csv')
def create_dict_list_with_key(n: int, value_input: np.ndarray,
                              in_key: Union[str, list]) -> list:
    dict_list = []
    for i in range(0, n):  # iterate over samples
        param_dict = dict()
        for j in range(0, get_dimension(in_key)):  # iterate over input parameters
            # If the key is a nested key, extract just the last part to find the type
            tmp_parts = str.split(in_key[j], '.')
            bool_set_key = True
            if len(tmp_parts) > 1:
                key = tmp_parts[-1]  # last entry
            else:
                key = in_key[j]

            if b_any(key in x for x in INTEGER_PARAMETERS):
                value = int(np.round(value_input[j, i]))
            elif b_any(key in x for x in LIST_PARAMETERS):
                value = [value_input[j, i]]  # convert to list
            elif b_any(key in x for x in GROUP_PARAMETERS):
                # convert to list with percentage of singles and couples
                value = [1 - value_input[j, i], 1 - value_input[j, i]]
            elif b_any(key in x for x in TOPOGRAPHY_SYMMETRY_PARAMETERS):
                # TODO: move this config, which is only for one specific
                # scenario file, into a special file or similar
                v_center_line = 7.5  # vertical center
                obstacle_height = 6.0

                my_key = "obstacles.[id==1].y"  # lower obstacle (Liddle_bhm_v3)
                param_dict[my_key] = -value_input[j, i] / 2 + v_center_line - obstacle_height

                my_key = "obstacles.[id==2].y"  # upper obstacle
                param_dict[my_key] = value_input[j, i] / 2 + v_center_line

                my_key = "targets.[id==1].height"
                target_2_height = value_input[j, i]
                param_dict[my_key] = target_2_height

                my_key = "targets.[id==1].y"  # target
                param_dict[my_key] = v_center_line - target_2_height / 2

                bool_set_key = False
            else:
                value = value_input[j, i]

            if bool_set_key:
                param_dict[in_key[j]] = value
        dict_list.append(param_dict)
    return dict_list
maxlen = 25

char_idx = None
if os.path.isfile(char_idx_file):
    print('Loading previous char_idx')
    char_idx = pickle.load(open(char_idx_file, 'rb'))

X, Y, char_idx = textfile_to_semi_redundant_sequences(
    path, seq_maxlen=maxlen, redun_step=1)

pickle.dump(char_idx, open(char_idx_file, 'wb'))

# Instantiating the checkpoint finder
checkpoint = False
list_of_files = os.listdir()
checkpoint_type = ".data-00000-of-00001"
if b_any(checkpoint_type in x for x in list_of_files):
    checkpoint = True

    def extract_number(f):
        s = re.findall(r"(\d+).data-00000-of-00001", f)
        return (int(s[0]) if s else -1, f)

    target = max(list_of_files, key=extract_number)
    target = target.split('.')[0]

# Begin main loop
with tf.device('/cpu:0'):
    # Launch tensorboard (disabled as it causes Python to crash)
    # os.spawnl(os.P_NOWAIT, "tensorboard --logdir='/tmp/tflearn_logs/" + ID + "'")
    # os.spawnl(os.P_NOWAIT, "start \"\" http://localhost:6006")
def func_depd_pruning(self):
    '''
    Function for determining functional dependencies using the pruning approach.
    This function determines functional dependencies for `company` as a sample.
    We have run the same code for the other tables in our model, and the
    results are documented in the write-up.
    :return: None
    '''
    input = ['ticker', 'exchange', 'company_name', 'sector', 'industry']
    output = sum([list(map(list, combinations(input, i))) for i in range(3)], [])
    output.pop(0)  # delete the empty set
    concat_list = [', '.join(sub_list) for sub_list in output]
    table_list = []
    table_dict = {}
    for column in concat_list:
        column_list = []
        query = ("SELECT array_agg(ticker) FROM company GROUP BY "
                 + column + " ORDER BY " + column)
        self.cursor.execute(query)
        arrays = self.cursor.fetchall()
        for array in arrays:
            array = str(array).strip('()[],')
            column_list.append(array.translate(str.maketrans('', '', '()[]')).split(', '))
        table_list.append(column_list)
    table_dict.clear()
    for i in range(0, len(concat_list)):
        table_dict[concat_list[i]] = table_list[i]

    func_depd = []
    for left_col in table_dict.keys():
        for right_col in input:
            count = 0
            lolleft = table_dict[left_col]
            lolright = table_dict[right_col]
            for left_list in lolleft:
                for right_list in lolright:
                    if set(left_list) <= set(right_list):
                        count += 1
                        break
            if count == len(lolleft):
                leftcollist = left_col.split(", ")
                someflag = True
                for col in leftcollist:
                    if col.strip(" ") == right_col.strip(" "):
                        someflag = False
                if someflag:
                    word = "-->" + str(right_col)
                    if not b_any(word in x for x in func_depd):
                        func_depd.append(left_col + "-->" + right_col)
    print(func_depd)
for lineNum, line in enumerate(data, 1):
    currLine = line.split()
    for wordNum, word in enumerate(currLine, 1):
        if not word.isdigit():
            if word not in en:
                if word not in lib.keys():
                    stemmed = stemmer.stem(word)
                    if stemmed not in lib.keys():
                        typo[stemmed] = Typo(lineNum, wordNum, word)

for word in typo.keys():
    print(word)
    # Walk prefixes from the front until no library word starts with one
    for x in range(0, len(word)):
        if not b_any(i.startswith(word[0:x]) for i in lib.keys()):
            typo[word].setErrorIndexFront(x)
            break
    # Walk suffixes from the rear until no library word ends with one
    for x in range(len(word) - 2, 0, -1):
        if not b_any(i.endswith(word[x:len(word)]) for i in lib.keys()):
            typo[word].setErrorIndexRear(x)
            break
    for sug in lib.keys():
        if abs(len(sug) - len(word)) < 2:
            if sug.startswith(word[0:typo[word].getErrorIndexFront() - 2]):
def find_files(in_path, ext, targets, template=r'(?<=\d{2})\d{5}', sub=False):
    """
    Finds matching files with extension ext and returns them in the order of
    the targets list given as argument.

    Returns a dictionary identical to what I was using before.
    Also drops duplicates.
    """
    # Go through each directory and see if I can find the subjects I am
    # looking for
    ext = '*{}'.format(ext)
    out_dict = {key: [] for key in ['sub_name', 'dir', 'path']}
    if not sub:
        sub_dirs = [d for d in next(os.walk(in_path))[1]]
    else:
        sub_dirs = [sub]

    for sub_dir in sub_dirs:
        tmp_dir = os.path.join(in_path, sub_dir)
        in_files = glob.glob(os.path.join(tmp_dir, ext))
        tmp_dict = dict()
        # Get the targets that we actually have files for
        matches = [x for x in targets if b_any(str(x) in t for t in in_files)]
        for in_file in in_files:
            sub_name = os.path.basename(in_file.split('.')[0])
            sub_id = int(re.search(r'{}'.format(template), sub_name).group())
            if sub_id in tmp_dict.keys():
                # This is a duplicate
                continue
            tmp_dict[sub_id] = (sub_name, in_file)
        # Re-sort the path info
        for target in matches:
            sub_name, in_file = tmp_dict[target]
            out_dict['sub_name'].append(sub_name)
            out_dict['dir'].append(sub_dir)
            out_dict['path'].append(in_file)
    return out_dict
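# Hypothetical call (paths, extension, and subject IDs are illustrative; the
# default template captures the 5 digits after a 2-digit site prefix):
#   files = find_files('/data/site_a', '_anat.nii.gz', targets=[12345, 67890])
#   files['sub_name'], files['dir'], files['path'] are aligned lists, one
#   entry per matched subject, with duplicates dropped.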