import json

from DataFiles import DataFiles


def main():
    file_manager = DataFiles()
    edge_data = file_manager.get_edge_data()
    edge_data = list(set(edge_data))
    file_manager.populate_cache()
    # We build a dictionary keyed by node ID. The entries follow this shape:
    # all_bands = {'nid': {"title": "Fleetwood Mac", "genres": [...], ...}}
    all_bands = {}
    for genre in file_manager.genres:
        for band in file_manager.cache[genre]:
            # If the band was already stored under a different genre, the ID is the same
            if band[0] in all_bands:
                all_bands[band[0]]["genres"].append(genre)
                continue
            all_bands[band[0]] = {}
            all_bands[band[0]]["title"] = band[1]
            all_bands[band[0]]["genres"] = [genre]
            all_bands[band[0]]["in_conns"] = []
            all_bands[band[0]]["out_conns"] = []
            all_bands[band[0]]["is_dup"] = False  # For later use
            all_bands[band[0]]["link_to"] = ""  # Set if it is a duplicate
    for edge in edge_data:
        for _ in range(int(edge[2])):
            all_bands[edge[0]]["out_conns"].append(edge[1])
            all_bands[edge[1]]["in_conns"].append(edge[0])
    # Disambiguation: two bands with the same name, but different IDs
    # max_node = file_manager.max_node  # TODO: Fix and implement this instead
    max_node = max([int(key) for key in all_bands])
    for i in range(max_node + 1):
        if str(i) not in all_bands:
            continue
        if all_bands[str(i)]["is_dup"]:
            continue
        for j in range(i + 1, max_node + 1):
            if str(j) not in all_bands or all_bands[str(j)]["is_dup"]:
                continue
            if all_bands[str(i)]["title"] == all_bands[str(j)]["title"]:
                if set(all_bands[str(i)]["out_conns"]) != set(
                        all_bands[str(j)]["out_conns"]):
                    print("The out connections didn't match?!?!", i, j)
                all_bands[str(i)]["in_conns"] += all_bands[str(j)]["in_conns"]
                all_bands[str(i)]["genres"] = list(
                    set(all_bands[str(i)]["genres"] + all_bands[str(j)]["genres"]))
                # Replace the duplicate with a pointer to the canonical entry
                all_bands[str(j)] = {"is_dup": True, "link_to": str(i)}
    with open('data/cleaned.txt', 'w') as writer:
        writer.write(json.dumps(all_bands))
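# A follow-up sketch (assuming the data/cleaned.txt layout written above):
# resolve a possibly-duplicated node ID to its canonical entry by following
# the "link_to" pointer set during disambiguation.
def resolve(nid):
    with open('data/cleaned.txt') as reader:
        all_bands = json.loads(reader.read())
    entry = all_bands[nid]
    if entry["is_dup"]:
        return all_bands[entry["link_to"]]
    return entry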
def check_all():
    file_manager = DataFiles()
    urls = file_manager.get_all_urls()
    for url in urls:
        # url = "/wiki/Lindsey_Buckingham"
        is_a_band = check_valid_page(url)
        if not is_a_band:
            print(base_url + url)
            break
def initial_scrape(curr_genre):
    params = {
        "rock": {
            "url": "https://en.wikipedia.org/wiki/List_of_rock_music_performers",
            "inds": [1, 28]
        },
        "country": {
            "url": "https://en.wikipedia.org/wiki/List_of_country_music_performers",
            "inds": [1, 27]
        },
        "metal": {
            "url": "https://en.wikipedia.org/wiki/List_of_heavy_metal_bands",
            "inds": []  # Unused: the metal page is parsed from its wikitable
        }
    }
    curr_page = params[curr_genre]
    r = requests.get(curr_page["url"])
    soup = BeautifulSoup(r.text, 'html.parser')
    if curr_genre in ["country", "rock"]:
        # The performer lists live in a fixed range of <ul> tags on these pages
        bands_list = soup.findAll("ul")[curr_page["inds"][0]:curr_page["inds"][1]]
        bands_list = "".join([str(band) for band in bands_list])
    elif curr_genre in ["metal"]:
        # The metal page uses a wikitable; take the first cell of each row
        bands_list = soup.find("table", {
            "class": "wikitable"
        }).tbody.findAll("tr")
        bands_list = ''.join([
            str(row.findAll("td")[0]) for row in bands_list
            if len(row.findAll("td")) > 0
        ])
    else:
        raise Exception("Invalid genre!")
    soup = BeautifulSoup(bands_list, "html.parser")
    hrefs = soup.findAll('a', href=True, title=True)
    urls = [a['href'] for a in hrefs]
    titles = [a['title'] for a in hrefs]
    if len(urls) != len(titles):
        raise Exception("Oops! There was a parsing error!")
    file_handler = DataFiles()
    file_handler.set_genre(curr_genre)
    file_handler.write_header()
    for i in range(len(urls)):
        file_handler.write_band(titles[i], urls[i])
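# Example invocation, seeding the data files for each supported genre:
for genre in ["rock", "country", "metal"]:
    initial_scrape(genre)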
def selectTeachers(self, filename):
    files = DataFiles()
    steps = files.readFile(filename)
    iterations = len(steps[0])
    tries = len(steps)
    finalRate = 0.01  # Considering the last 1% of performed actions
    totalFinalSteps = np.zeros(tries)
    minStep = iterations - int(iterations * finalRate)
    for i in range(tries):
        acum = 0
        for j in range(minStep, iterations):
            acum += steps[i][j]
        #endfor
        totalFinalSteps[i] = acum
    #endfor
    totalFinalStepsTidy = np.argsort(totalFinalSteps)
    bestAgent = totalFinalStepsTidy[0]
    medianAgent = totalFinalStepsTidy[int(tries / 2)]
    worstAgent = totalFinalStepsTidy[tries - 1]
    return bestAgent, medianAgent, worstAgent
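# For illustration (a sketch with made-up totals, assuming lower step totals
# are better): np.argsort orders agents by ascending final-step totals, so
# the first index is the best teacher and the last the worst.
import numpy as np

totalFinalSteps = np.array([120.0, 80.0, 300.0, 95.0])
order = np.argsort(totalFinalSteps)  # -> [1, 3, 0, 2]
print(order[0])                # best agent: index 1 (total 80)
print(order[len(order) // 2])  # median agent: index 0 (total 120)
print(order[-1])               # worst agent: index 2 (total 300)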
import json
import os
import shutil
import time

from DataFiles import DataFiles


class AstroMerger():
    def __init__(self, origin, destination):
        self.origin = DataFiles(origin)
        self.destination = DataFiles(destination)
        self.changes = []
        self.move = []
        self.origin.computeChecksums()
        self.destination.computeChecksums()

    def compareFiles(self):
        o = self.origin.checksums
        d = self.destination.checksums
        for file in o.keys():
            # Queue any file that is missing at the destination or whose
            # checksum has changed
            if file not in d or o[file] != d[file]:
                self.move.append(file)
                entry = {"Date": time.asctime(time.localtime(time.time()))}
                entry["File"] = file
                # A file already present at the destination was altered;
                # otherwise it is new
                entry["Status"] = "ALTERED" if file in d else "NEW"
                self.changes.append(entry)

    def migrateFiles(self):
        currentDir = os.getcwd()
        # accessPath() is expected to chdir into the origin directory
        self.origin.accessPath()
        for f in os.listdir("."):
            if f in self.move:
                shutil.copy(f, self.destination.path)
                os.remove(f)
        os.chdir(currentDir)

    def registerChanges(self, logname="log.json"):
        with open(logname, 'w') as f:
            json.dump(self.changes, f, sort_keys=True, indent=4,
                      ensure_ascii=False)
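# A minimal usage sketch; the directory paths here are hypothetical and
# assume DataFiles(path) wraps a directory:
merger = AstroMerger("/data/incoming", "/data/archive")
merger.compareFiles()     # mark files as NEW or ALTERED and queue them
merger.migrateFiles()     # copy queued files over, removing the originals
merger.registerChanges()  # write the change log to log.json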
# This program goes through every page in the current set of valid URLs and
# scans it for valid neighbor nodes.
# This is a one-step BFS gathering of new nodes.
# This is purely for data collection; no links are being established at this point.
import requests
from bs4 import BeautifulSoup

from DataFiles import DataFiles
from confirm_page_structure import check_valid_page

base_url = "https://en.wikipedia.org/"
file_manager = DataFiles()
for url in file_manager.get_all_urls():
    r = requests.get(base_url + url)
    # r = requests.get("https://en.wikipedia.org/wiki/Christine_McVie")
    print(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    main_content = soup.findAll("div", {"id": "content"})
    # If, for some bizarre reason, there's not exactly one id="content" div tag
    if len(main_content) != 1:
        main_content = soup
    else:
        main_content = main_content[0]
    for a in main_content.findAll("a", href=True):
        res = file_manager.test_valid_url(a["href"])
        if res == 0:
            # print(url)
            pass
def loadQValues(self, filename):
    files = DataFiles()
    self.Q = files.readFloatFile(filename)
def saveQValues(self, filename):
    files = DataFiles()
    files.createFile(filename)
    for i in range(self.numberOfStates):
        files.addFloatToFile(filename, self.Q[i])
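# Round-trip sketch ("agent" is a hypothetical instance of the class these
# methods belong to, and "qvalues.txt" a hypothetical filename): persist the
# Q-table, one float entry per state, then restore it into self.Q.
agent.saveQValues("qvalues.txt")
agent.loadQValues("qvalues.txt")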
# This file goes through all genres
import requests
from bs4 import BeautifulSoup

from DataFiles import DataFiles

base_url = "https://en.wikipedia.org/"
file_manager = DataFiles()
for url in file_manager.get_all_urls():
    # url = "/wiki/Tears_for_Fears"
    r = requests.get(base_url + url)
    # r = requests.get("https://en.wikipedia.org/wiki/Christine_McVie")
    print(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    main_content = soup.findAll("div", {"id": "content"})
    # If, for some bizarre reason, there's not exactly one id="content" div tag
    if len(main_content) != 1:
        main_content = soup
    else:
        main_content = main_content[0]
    # Ensure we only add one of each connection (dict keys act as a set)
    add_urls = dict()
    for a in main_content.findAll("a", href=True):
        href = str(a["href"])
        res = file_manager.test_valid_url(href)
---------
1) TRAINING: store each training instance in a list (training_list)
2) CLASSIFICATION
    2.1) Given an instance whose class we want to predict
        2.1.1) get the distance to each of the instances
            #### lista_distancias <- getDistanciaACadaInstancia(instancia, m)
            |NO:::: 2.1.1.1) for each instance 'i' in the training_list:
            |NO::::     lista_distancias <- calcularDistanciaMinkowski(instancia, training_list[i], m)
            |NO::::     return lista_distancias
        2.1.2) sort the distances
            #### sort(lista_distancias)
        2.1.3) take the k instances with the smallest distances
    2.2) Label the instance with the majority class
'''

if __name__ == '__main__':  # main
    start_time = datetime.now()
    data = DataFiles()
    preproceso = Preproceso()
    evaluacion = Evaluacion()
    try:
        # 1. READ FILE
        inputPath = sys.argv[1]
        # 1.2 load the file
        input_instances = data.obtenerInstanciasDeFichero(inputPath)
        input_instances.calcNumClases()
        # 2. PREPROCESSING
        preproceso.randomizarInstancias(input_instances)
        # 3. kNN
        # 70% instances with class, 30% instances to classify
        k = int(sys.argv[2])  # number of neighbors to explore
        m = float(sys.argv[3])  # m for the Minkowski distance
        porcentaje = sys.argv[4]
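# A minimal sketch of the classification steps described above (not the
# project's own getDistanciaACadaInstancia / calcularDistanciaMinkowski):
# the Minkowski distance of order m, then the indices of the k nearest
# training instances.
import numpy as np

def minkowski_distance(a, b, m):
    # dist(a, b) = (sum_i |a_i - b_i|^m)^(1/m)
    return np.sum(np.abs(np.asarray(a) - np.asarray(b)) ** m) ** (1.0 / m)

def k_nearest(instance, training_list, k, m):
    # Distance to every training instance, then the k smallest (steps
    # 2.1.1 through 2.1.3 of the pseudocode)
    distances = [minkowski_distance(instance, t, m) for t in training_list]
    return np.argsort(distances)[:k]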