Code example #1
File: AstroMerger.py Project: sstsalazar/AstroMerger
    def __init__(self, origin, destination):
        self.origin = DataFiles(origin)
        self.destination = DataFiles(destination)
        self.changes = []
        self.move = []
        self.origin.computeChecksums()
        self.destination.computeChecksums()
Code example #2
import json

from DataFiles import DataFiles


def main():
    file_manager = DataFiles()
    edge_data = file_manager.get_edge_data()
    edge_data = list(set(edge_data))
    file_manager.populate_cache()

    # We build a dictionary where the key is the node ID. Each value looks like:
    # all_bands = {'nid': {"title": "Fleetwood Mac", "genres": [...], ...}}
    all_bands = {}

    for genre in file_manager.genres:
        for band in file_manager.cache[genre]:
            # If the band was already stored under a different genre (the ID is the same),
            # just record the additional genre
            if band[0] in all_bands:
                all_bands[band[0]]["genres"].append(genre)
                continue

            all_bands[band[0]] = {}
            all_bands[band[0]]["title"] = band[1]
            all_bands[band[0]]["genres"] = [genre]
            all_bands[band[0]]["in_conns"] = []
            all_bands[band[0]]["out_conns"] = []
            all_bands[band[0]]["is_dup"] = False  # For later use
            all_bands[band[0]]["link_to"] = ""  # If it is a duplicate

    for edge in edge_data:
        for _ in range(int(edge[2])):
            all_bands[edge[0]]["out_conns"].append(edge[1])
            all_bands[edge[1]]["in_conns"].append(edge[0])

    # Disambiguation: merge bands that share a title but have different IDs

    # max_node = file_manager.max_node  # TODO: Fix and implement this instead
    max_node = max([int(key) for key in all_bands])

    for i in range(max_node + 1):
        if str(i) not in all_bands:
            continue
        if all_bands[str(i)]["is_dup"]:
            continue
        for j in range(i + 1, max_node + 1):
            if str(j) not in all_bands or all_bands[str(j)]["is_dup"]:
                continue
            if all_bands[str(i)]["title"] == all_bands[str(j)]["title"]:
                if set(all_bands[str(i)]["out_conns"]) != set(
                        all_bands[str(j)]["out_conns"]):
                    print("The out connections didn't match?!?!", i, j)

                all_bands[str(i)]["in_conns"] += all_bands[str(j)]["in_conns"]
                all_bands[str(i)]["genres"] = list(
                    set(
                        list(all_bands[str(i)]["genres"] +
                             all_bands[str(j)]["genres"])))

                all_bands[str(j)] = {"is_dup": True, "link_to": str(i)}

    with open('data/cleaned.txt', 'w') as writer:
        writer.write(json.dumps(all_bands))
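
Downstream code can load data/cleaned.txt and follow the link_to field written above to reach the surviving record for a duplicate band. A minimal sketch, assuming the JSON layout produced by main(); the resolve helper is illustrative and not part of the project:

import json

with open('data/cleaned.txt') as reader:
    all_bands = json.loads(reader.read())

def resolve(nid):
    # Illustrative helper: follow link_to until a non-duplicate record is reached
    while all_bands[nid].get("is_dup"):
        nid = all_bands[nid]["link_to"]
    return all_bands[nid]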
Code example #3
from DataFiles import DataFiles
from confirm_page_structure import check_valid_page

base_url = "https://en.wikipedia.org/"


def check_all():
    file_manager = DataFiles()
    urls = file_manager.get_all_urls()

    for url in urls:
        # url = "/wiki/Lindsey_Buckingham"
        is_a_band = check_valid_page(url)

        if not is_a_band:
            print(base_url + url)

        break  # only the first URL is checked; remove this break to check them all
Code example #4
import requests
from bs4 import BeautifulSoup
from DataFiles import DataFiles


def initial_scrape(curr_genre):
    params = {
        "rock": {
            "url":
            "https://en.wikipedia.org/wiki/List_of_rock_music_performers",
            "inds": [1, 28]
        },
        "country": {
            "url":
            "https://en.wikipedia.org/wiki/List_of_country_music_performers",
            "inds": [1, 27]
        },
        "metal": {
            "url": "https://en.wikipedia.org/wiki/List_of_heavy_metal_bands",
            "inds": []
        }
    }

    curr_page = params[curr_genre]
    r = requests.get(curr_page["url"])

    soup = BeautifulSoup(r.text, 'html.parser')

    if curr_genre in ["country", "rock"]:
        bands_list = soup.findAll(
            "ul")[curr_page["inds"][0]:curr_page["inds"][1]]
        bands_list = "".join([str(band) for band in bands_list])

    elif curr_genre in ["metal"]:
        bands_list = soup.find("table", {
            "class": "wikitable"
        }).tbody.findAll("tr")
        bands_list = ''.join([
            str(row.findAll("td")[0]) for row in bands_list
            if len(row.findAll("td")) > 0
        ])

    else:
        raise Exception("Invalid genre!")

    soup = BeautifulSoup(bands_list, "html.parser")
    hrefs = soup.findAll('a', href=True, title=True)

    urls = [a['href'] for a in hrefs]
    titles = [a['title'] for a in hrefs]

    if len(urls) != len(titles):
        raise Exception("Oops! There was a parsing error!")

    file_handler = DataFiles()
    file_handler.set_genre(curr_genre)
    file_handler.write_header()
    for i in range(len(urls)):
        file_handler.write_band(titles[i], urls[i])
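
A brief usage sketch: running the scraper once per genre key defined in params. This driver block is illustrative and not part of the original file:

if __name__ == "__main__":
    # Illustrative driver; genre keys come from the params dict above
    for genre in ["rock", "country", "metal"]:
        initial_scrape(genre)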
Code example #5
    def selectTeachers(self, filename):
        # Pick the best, median, and worst agents by the number of steps taken
        # over the final 1% of iterations (fewer steps ranks higher)
        files = DataFiles()
        steps = files.readFile(filename)
        iterations = len(steps[0])
        tries = len(steps)
        finalRate = 0.01  # consider only the last 1% of performed actions

        totalFinalSteps = np.zeros(tries)
        minStep = iterations - int(iterations*finalRate)
        for i in range(tries):
            acum = 0
            for j in range(minStep, iterations):
                acum += steps[i][j]
            #endfor
            totalFinalSteps[i] = acum
        #endfor
            
        totalFinalStepsTidy = np.argsort(totalFinalSteps)
        bestAgent = totalFinalStepsTidy[0]
        medianAgent = totalFinalStepsTidy[int(tries/2)]
        worstAgent = totalFinalStepsTidy[tries-1]

        return bestAgent, medianAgent, worstAgent
Code example #6
File: AstroMerger.py Project: sstsalazar/AstroMerger
import json
import os
import shutil
import time

from DataFiles import DataFiles  # assumed import path for the project's DataFiles helper


class AstroMerger():
    def __init__(self, origin, destination):
        self.origin = DataFiles(origin)
        self.destination = DataFiles(destination)
        self.changes = []
        self.move = []
        self.origin.computeChecksums()
        self.destination.computeChecksums()

    #def compareFiles(self):
    #o = self.origin.checksums
    #d = self.destination.checksums
    #for file in o.keys():
    #if file not in d:
    #self.newFiles.append(file)
    #else:
    #if o[file] != d[file]:
    #self.newVersions.append(file)

    def compareFiles(self):
        o = self.origin.checksums
        d = self.destination.checksums
        oldVersions = self.destination.checksums.keys()
        for file in o.keys():
            if file not in d or o[file] != d[file]:
                self.move.append(file)
                entry = {"Date": time.asctime(time.localtime(time.time()))}
                entry["File"] = file
                if file in oldVersions:
                    entry["Status"] = "ALTERED"
                else:
                    entry["Status"] = "NEW"
                self.changes.append(entry)

    def migrateFiles(self):
        currentDir = os.getcwd()
        self.origin.accessPath()
        for f in os.listdir("."):
            if f in self.move:
                shutil.copy(f, self.destination.path)
            # Every file is removed from the origin afterwards; files not in
            # self.move are unchanged and already present at the destination.
            os.remove(f)
        os.chdir(currentDir)

    def registerChanges(self, logname="log.json"):
        with open(logname, 'w') as f:
            json.dump(self.changes,
                      f,
                      sort_keys=True,
                      indent=4,
                      ensure_ascii=False)
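
A minimal usage sketch built only from the methods above; the two directory paths are placeholders rather than values from the project:

merger = AstroMerger("/path/to/origin", "/path/to/destination")  # placeholder paths
merger.compareFiles()                # decide which files are NEW or ALTERED
merger.migrateFiles()                # copy them to the destination and clear the origin
merger.registerChanges("log.json")   # write the change log as JSON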
Code example #7
# This program goes through every page in the current set of valid URLs and scans each page for valid neighbor nodes.
# This is a one-step BFS gathering of new nodes.
# It is purely for data collection; no links are established at this point.

import requests
from bs4 import BeautifulSoup
from DataFiles import DataFiles
from confirm_page_structure import check_valid_page

base_url = "https://en.wikipedia.org/"

file_manager = DataFiles()

for url in file_manager.get_all_urls():
    r = requests.get(base_url + url)
    # r = requests.get("https://en.wikipedia.org/wiki/Christine_McVie")
    print(url)

    soup = BeautifulSoup(r.text, 'html.parser')
    main_content = soup.findAll("div", {"id": "content"})

    # If, for some bizarre reason, there's not exactly one id="content" div tag
    if len(main_content) != 1:
        main_content = soup
    else:
        main_content = main_content[0]

    for a in main_content.findAll("a", href=True):
        res = file_manager.test_valid_url(a["href"])
        if res == 0:
            pass  # print(url)
Code example #8
    def loadQValues(self, filename):
        files = DataFiles()
        self.Q = files.readFloatFile(filename)
Code example #9
    def saveQValues(self, filename):
        files = DataFiles()
        files.createFile(filename)
        for i in range(self.numberOfStates):
            files.addFloatToFile(filename, self.Q[i])
Code example #10
# This file goes through the pages collected for all genres and scans each one for valid links

import requests
from bs4 import BeautifulSoup
from DataFiles import DataFiles

base_url = "https://en.wikipedia.org/"

file_manager = DataFiles()

for url in file_manager.get_all_urls():
    # url = "/wiki/Tears_for_Fears"
    r = requests.get(base_url + url)
    # r = requests.get("https://en.wikipedia.org/wiki/Christine_McVie")
    print(url)

    soup = BeautifulSoup(r.text, 'html.parser')
    main_content = soup.findAll("div", {"id": "content"})

    # If, for some bizarre reason, there's not exactly one id="content" div tag
    if len(main_content) != 1:
        main_content = soup
    else:
        main_content = main_content[0]

    # Ensure we only add one of each connection; the dict's keys act as a set
    add_urls = dict()

    for a in main_content.findAll("a", href=True):
        href = str(a["href"])
        res = file_manager.test_valid_url(href)
Code example #11
---------
1) TRAINING: store every training instance in a list (training_list)
2) CLASSIFICATION
    2.1) Given an instance whose class we want to predict
        2.1.1) compute the distance to each training instance ####lista_distancias<-getDistanciaACadaInstancia(instancia, m)
            |NO::::2.1.1.1) for each instance 'i' in training_list:
            |NO::::        lista_distancias<-calcularDistanciaMinkowski(instancia, training_list[i], m)
            |NO::::        return lista_distancias
        2.1.2) sort the distances #####sort(lista_distancias)
        2.1.3) take the k instances with the smallest distances
    2.2) Label the instance with the majority class
'''

# Imports reconstructed from the usage below; the module paths for the
# project's own classes (DataFiles, Preproceso, Evaluacion) are assumed.
import sys
from datetime import datetime

from DataFiles import DataFiles
from Evaluacion import Evaluacion
from Preproceso import Preproceso

if __name__ == '__main__':  # main
    start_time = datetime.now()
    data = DataFiles()
    preproceso = Preproceso()
    evaluacion = Evaluacion()
    try:
        # 1. READ THE FILE
        inputPath = sys.argv[1]
        input_instances = data.obtenerInstanciasDeFichero(inputPath)  # 1.2 load the file
        input_instances.calcNumClases()
        # 2. PREPROCESSING
        preproceso.randomizarInstancias(input_instances)

        # 3. kNN
        # 70% labeled instances, 30% instances to classify
        k = int(sys.argv[2])  # number of neighbours to explore
        m = float(sys.argv[3])  # m (order) of the Minkowski distance
        porcentaje = sys.argv[4]
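
The classification plan in the docstring above reads as the following short sketch. This is a minimal illustration, not the project's code: the function name, the plain numeric feature lists, and the label-in-last-position convention are all assumptions.

def classify(instance, training_list, k, m):
    # Illustrative sketch of the plan above; names and data format are assumptions
    # 2.1.1) distance from the instance to every training instance (Minkowski, order m)
    distances = []
    for train in training_list:
        dist = sum(abs(a - b) ** m for a, b in zip(instance, train[:-1])) ** (1.0 / m)
        distances.append((dist, train[-1]))  # keep the class label next to its distance
    # 2.1.2) sort the distances and 2.1.3) keep the k nearest
    distances.sort(key=lambda pair: pair[0])
    nearest = distances[:k]
    # 2.2) label with the majority class among the k nearest neighbours
    labels = [label for _, label in nearest]
    return max(set(labels), key=labels.count)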