def __init__(self):
    self.dict = self.init_db()
    self.finished = []
    # read user
    p = util.get_root("user", "codeforces")
    entries = os.listdir(p)
    self.finished = entries
def main():
    cache = get_cache()
    failed_uris = get_failed()
    parse_failed_uris = get_parse_failed()
    uris = cache.keys()
    peak_missing = [uri for uri in uris if LISTENERPEAK not in cache[uri]]
    peak_missing = set(peak_missing) - failed_uris
    # XXX: fetch_stream_infos is the same for each root url
    peak_missing = {get_root(uri) for uri in peak_missing}
    peak_missing = set(peak_missing) - parse_failed_uris

    pool = Pool(PROCESSES)
    try:
        pfunc = fetch_stream_infos
        for i, res in enumerate(pool.imap_unordered(pfunc, peak_missing)):
            uri, streams = res
            # checkpoint the cache every 1000 entries
            if (i + 1) % 1000 == 0:
                set_cache(cache)
            print("%d/%d %s -> %d new streams" % (
                i + 1, len(peak_missing), uri, len(streams)))
            if not streams:
                parse_failed_uris.add(uri)
            # add newly found uris to the cache + listener counts
            for stream in streams:
                peak = str(int(stream.peak))
                current = str(int(stream.current))
                uri = stream.stream
                if uri not in cache:
                    cache[uri] = {}
                if LISTENERPEAK in cache[uri]:
                    cache[uri][LISTENERPEAK].append(peak)
                else:
                    cache[uri][LISTENERPEAK] = [peak]
                if LISTENERCURRENT in cache[uri]:
                    cache[uri][LISTENERCURRENT].append(current)
                else:
                    cache[uri][LISTENERCURRENT] = [current]
    except Exception as e:
        print(e)
    finally:
        set_parse_failed(parse_failed_uris)
        set_cache(cache)
        pool.terminate()
        pool.join()
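# A minimal, self-contained sketch of the checkpointing pattern above:
# stream results from a worker pool with imap_unordered and persist the
# cache every N items, so a crash loses at most one batch. fetch_stub,
# CACHE_PATH and save_cache are illustrative stand-ins, not names from
# the original.
import json
from multiprocessing import Pool

CACHE_PATH = "cache.json"
SAVE_EVERY = 1000


def fetch_stub(uri):
    # stand-in for fetch_stream_infos: return the uri and a fake result
    return uri, [len(uri)]


def save_cache(cache):
    with open(CACHE_PATH, "w") as f:
        json.dump(cache, f)


def run(uris):
    cache = {}
    with Pool(4) as pool:
        for i, (uri, streams) in enumerate(pool.imap_unordered(fetch_stub, uris)):
            cache[uri] = streams
            if (i + 1) % SAVE_EVERY == 0:
                save_cache(cache)  # periodic checkpoint
    save_cache(cache)              # final flush


if __name__ == "__main__":
    run(["http://example.com/%d" % n for n in range(10)])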
def __init__(self):
    self.dict = self.init_db()
    self.finishes = []
    self.flasks = []
    # read user
    p = util.get_root("user", "leetcode")
    entries = os.listdir(p)
    for k in entries:
        if k.endswith(".cpp"):
            self.finishes.append(k)
        elif k.endswith(".md"):
            self.flasks.append(k)
def find_shortest_path(start_node: int):
    nodes = {
        index: (sys.maxsize, [index], UNVISITED)
        for index in neighbors.keys()
    }
    nodes[start_node] = (0, [start_node], FRONTIER)
    round = 0
    directory = os.path.realpath(
        os.path.join(get_root(), 'output/shortest', str(start_node)))
    if not os.path.exists(directory):
        os.mkdir(directory)
    with open(os.path.join(directory, str(round)), 'w',
              encoding='utf-8') as file:
        for key, value in nodes.items():
            file.write(f'{key},{value}\n')
    current = 0
    print(f'Searching from node {start_node}')
    print(f'Start {datetime.now()}')
    while True:
        done = True
        current_file = os.path.join(directory, str(round))
        print(current_file)
        job = ShortestPath(args=[current_file])
        with job.make_runner() as runner:
            runner.run()
            round += 1
            with open(os.path.join(directory, str(round)), 'w',
                      encoding='utf-8') as f:
                for key, row in job.parse_output(runner.cat_output()):
                    dist, path, state = row
                    f.write(f'{key},({dist}, {path}, {state})\n')
                    if state == UNVISITED or state == FRONTIER:
                        done = False
                    if state == FRONTIER:
                        current += 1
        print(f'{round} completed')
        if current == 0:
            print('No more nodes on the frontier, stopping')
            break
        current = 0
        if done:
            break
    print(f'Done {datetime.now()}')
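# The rounds above implement a level-synchronous breadth-first expansion:
# every node is UNVISITED, on the FRONTIER, or VISITED, and each MapReduce
# round moves the frontier one hop outward. A minimal in-memory sketch of
# the same state machine (the toy graph is illustrative):
import sys

UNVISITED, FRONTIER, VISITED = 0, 1, 2


def bfs_shortest_paths(graph, start):
    # dist, path, state per node -- the same triple the rounds serialize
    nodes = {n: [sys.maxsize, [n], UNVISITED] for n in graph}
    nodes[start] = [0, [start], FRONTIER]
    while True:
        frontier = [n for n, (_, _, s) in nodes.items() if s == FRONTIER]
        if not frontier:
            break  # no more nodes on the frontier, stopping
        for n in frontier:
            dist, path, _ = nodes[n]
            for m in graph[n]:
                if dist + 1 < nodes[m][0]:
                    nodes[m] = [dist + 1, path + [m], FRONTIER]
            nodes[n][2] = VISITED
    return nodes


# toy graph: 0 -> 1 -> 2, 0 -> 2
print(bfs_shortest_paths({0: [1, 2], 1: [2], 2: []}, 0))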
def get_tree_log_prob(self,
                      tree,
                      obs_embedding=None,
                      previous_sample_embedding=None,
                      inference_hidden=None,
                      obs=None):
    """Log probability of tree given obs.

    Args:
        tree: list or string
        obs_embedding: tensor [obs_embedding_dim]
        previous_sample_embedding: tensor [sample_embedding_dim]
        inference_hidden: tensor [inference_hidden_dim]
        obs: sentence (list of strings) or ys (torch.tensor of shape [100])

    Returns: log_prob (scalar tensor)"""

    if obs_embedding is None:
        obs_embedding = self.get_obs_embedding(obs)
    if previous_sample_embedding is None:
        previous_sample_embedding = torch.zeros(
            (self.sample_embedding_dim, ))
    if inference_hidden is None:
        inference_hidden = torch.zeros((self.inference_hidden_dim, ))

    if isinstance(tree, list):
        non_terminal = tree[0]
        sample_address_embedding = util.get_sample_address_embedding(
            non_terminal, self.grammar['non_terminals'])
        inference_gru_output = self.get_inference_gru_output(
            obs_embedding, previous_sample_embedding,
            sample_address_embedding, inference_hidden)
        subtrees = tree[1:]
        production = [util.get_root(subtree) for subtree in subtrees]
        production_index = util.get_production_index(
            non_terminal, production, self.grammar['productions'])
        sample_embedding = self.get_sample_embedding(production_index)
        logits = self.get_logits_from_inference_gru_output(
            inference_gru_output, non_terminal)
        dist = Categorical(logits=logits)
        log_prob = dist.log_prob(torch.tensor(production_index))
        subtree_log_probs = [
            self.get_tree_log_prob(subtree, obs_embedding,
                                   sample_embedding, inference_gru_output)
            for subtree in subtrees
        ]
        return log_prob + sum(subtree_log_probs)
    else:
        return torch.zeros(())
def get_tree_log_prob(self, tree):
    """Log probability of tree.

    Args:
        tree: list of lists or string

    Returns: scalar tensor
    """
    if isinstance(tree, list):
        non_terminal = tree[0]
        subtrees = tree[1:]
        production = [util.get_root(subtree) for subtree in subtrees]
        production_index = util.get_production_index(
            non_terminal, production, self.grammar['productions'])
        dist = Categorical(logits=self.production_logits[non_terminal])
        log_prob = dist.log_prob(torch.tensor(production_index))
        subtree_log_probs = [
            self.get_tree_log_prob(subtree) for subtree in subtrees
        ]
        return log_prob + sum(subtree_log_probs)
    else:
        return torch.zeros(())
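# A minimal, self-contained sketch of the recursion above for a toy PCFG:
# the log probability of a tree is the log probability of the production
# chosen at its root plus the log probabilities of its subtrees; leaves
# (strings) contribute zero. The toy grammar and the get_root stand-in are
# illustrative, not the project's util module or grammar.
import torch
from torch.distributions import Categorical

# toy grammar: S -> [NP, VP] | [VP]
productions = {'S': [['NP', 'VP'], ['VP']]}
production_logits = {'S': torch.zeros(2)}  # uniform over both productions


def get_root(tree):
    # stand-in for util.get_root: a subtree's root symbol, or the string itself
    return tree[0] if isinstance(tree, list) else tree


def get_tree_log_prob(tree):
    if isinstance(tree, list):
        non_terminal, subtrees = tree[0], tree[1:]
        production = [get_root(t) for t in subtrees]
        production_index = productions[non_terminal].index(production)
        dist = Categorical(logits=production_logits[non_terminal])
        log_prob = dist.log_prob(torch.tensor(production_index))
        return log_prob + sum(get_tree_log_prob(t) for t in subtrees)
    return torch.zeros(())  # terminal: probability 1


print(get_tree_log_prob(['S', 'NP', 'VP']))  # log(0.5)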
    # tail of create_app: fallback routes, static files, and the
    # configuration POST endpoint
    @app.errorhandler(404)
    def not_found(e):
        return send_file(os.path.join(static_folder, "index.html"))

    @app.route("/api/static/<path:path>")
    def send_static(path):
        return send_from_directory("static", path)

    @app.route("/api/conf", methods=["POST"])
    def post_set_conf():
        some_json = request.get_json()
        try:
            for i in some_json:
                for j in some_json[i]:
                    config.set_conf(section=i, element=j,
                                    value=some_json[i][j])
        except Exception:
            _, error, _ = sys.exc_info()
            return jsonify({"Error": "{err}".format(err=error)}), 500
        return jsonify(some_json), 200

    return app


if __name__ == "__main__":
    CONF_PATH = os.path.join(util.get_root(), "configuration.ini")
    conf = Configuration(CONF_PATH)
    STATIC_PATH = os.path.join(util.get_root(),
                               conf.get_conf("Client", "static-files-path"))
    APP = create_app(conf, STATIC_PATH, None)
    APP.run(debug=True)
import sys
import os
import logging
import json
from datetime import datetime
from enum import Enum
from typing import Callable

from ax_listener import AXFrame
from db_interface import TelemetryDB

import util

if getattr(sys, 'frozen', False):
    sys.path.append(os.path.join(util.get_root(), 'src'))


class TelemetryFrame:
    """ Data structure for the output of the telemetry listener that is sent
        to the repository. """

    def __init__(self, packet_timestamp: datetime, fields):
        self.timestamp = packet_timestamp
        self.fields = fields

    def __repr__(self):
        # __init__ only sets timestamp and fields, so only report those
        return "Timestamp: {}; fields: {}".format(self.timestamp, self.fields)


class TimestampType(Enum):
    """ Enum of the supported timestamp types. """
    unix = "unix_timestamp"
import csv
import os
import sys

from util import get_root

csv_filename = sys.argv[1]

with open(os.path.join(get_root(), 'output', csv_filename), 'r',
          encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='|')
    sorted_list = sorted(reader, key=lambda x: int(x[1]), reverse=True)

with open(os.path.join(get_root(), 'output', f'sorted_{csv_filename}'), 'w',
          encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='|')
    for row in sorted_list:
        writer.writerow(row)
from typing import List
import os

import pygame

import util

root = util.get_root()


class Text:
    def __init__(self, start: List[int], text: str, font, color):
        self.start = start
        self.text = font.render(text, True, color)

    def render(self, display):
        display.blit(self.text, self.start)


class FileText(Text):
    def __init__(self, start, text: str, font: str, size, color):
        font = pygame.font.Font(font, size)
        Text.__init__(self, start, text, font, color)


class ExoText(FileText):
    def __init__(self, start, text: str, size, color):
        FileText.__init__(self, start, text,
                          os.path.join(root, "fonts/Exo/regular.ttf"),
                          size, color)


class ExoTextLight(FileText):
    def __init__(self, start, text: str, size, color):
        FileText.__init__(self, start, text,
                          os.path.join(root, "fonts/Exo/light.ttf"),
                          size, color)
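# A minimal sketch of using the classes above: initialize pygame, render an
# ExoText onto a display, and flip. The window size, text, and event loop are
# illustrative, and the font file is assumed to exist under fonts/Exo/.
import pygame

pygame.init()
display = pygame.display.set_mode((640, 480))
label = ExoText([20, 20], "Hello", 32, (255, 255, 255))

running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
    display.fill((0, 0, 0))
    label.render(display)
    pygame.display.flip()
pygame.quit()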
import json
import os
import sys

from util import get_root

filename = sys.argv[1]

with open(os.path.join(get_root(), 'output', filename), 'r',
          encoding='utf-8') as f:
    degrees = json.load(f)

with open(os.path.join(get_root(), 'output',
                       os.path.splitext(filename)[0] + '.csv'), 'w',
          encoding='utf-8') as f:
    f.write('x,y\n')
    for key, value in sorted(degrees.items(), key=lambda x: int(x[0])):
        f.write(f'{key},{value}\n')
import csv
import os
import sys

from util import get_root

filename = sys.argv[1]

with open(os.path.join(get_root(), 'output', filename), 'r',
          encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='|')
    total_links = 0
    count = 0
    min_degree = sys.maxsize
    max_degree = -1
    for _, degree in reader:
        degree = int(degree)
        total_links += degree
        count += 1
        if degree < min_degree:
            min_degree = degree
        if degree > max_degree:
            max_degree = degree

with open(os.path.join(get_root(), 'output', f'statistics_{filename}'), 'w',
          encoding='utf-8') as f:
    f.write(f'Total pages: {count}\n')
    f.write(f'Total links: {total_links}\n')
def create_app(config: Configuration, tnc_pool: TNCPool,
               sids_relay: SIDSRelay) -> Flask:
    """ Creates a flask app for the api. """
    log = logging.getLogger(__name__)
    static_folder = os.path.join(
        util.get_root(), config.get_conf("Client", "static-files-path"))
    app = Flask(__name__, static_url_path="", static_folder=static_folder)
    CORS(app)

    if not config.get_conf("Client", "debug-log"):
        server_log = logging.getLogger("werkzeug")
        server_log.setLevel(logging.WARN)

    # swagger specific
    swagger_url = "/api/docs"
    api_url = "/api/static/swagger.yaml"
    swaggerui_blueprint = get_swaggerui_blueprint(
        swagger_url,
        api_url,
        config={"app_name": "Estcube 2 Telemetry API"}
    )
    app.register_blueprint(swaggerui_blueprint, url_prefix=swagger_url)
    # end swagger specific

    @app.route("/api/sids/status", methods=["GET"])
    def get_sids_status():
        return jsonify(sids_relay.get_status()), 200

    @app.route("/api/sids/toggle", methods=["POST"])
    def toggle_relay():
        response_json = request.get_json()
        current_relay_status = response_json["Mission Control"]["relay-enabled"]
        config.set_conf(section="Mission Control", element="relay-enabled",
                        value=current_relay_status)
        if current_relay_status:
            threading.Thread(target=sids_relay.relay_unrelayed_packets,
                             daemon=True).start()
        return response_json, 200

    @app.route("/api/tnc/<name>/status", methods=["GET"])
    def get_tnc_connection_check(name: str):
        if tnc_pool is None:
            return jsonify({"error": "TNC Pool is not defined."}), 500
        res = tnc_pool.check_tnc(name)
        return jsonify({"name": name, "status": res.name}), 200

    @app.route("/api/tnc/Main/start", methods=["POST"])
    def post_tnc_main_start():
        if tnc_pool is None:
            return jsonify({"error": "TNC Pool is not defined."}), 500
        tnc_pool.connect_main_tnc()
        return "", 204

    @app.route("/api/tnc/<name>/stop", methods=["POST"])
    def post_tnc_connection_stop(name: str):
        if tnc_pool is None:
            return jsonify({"error": "TNC Pool is not defined."}), 500
        tnc_pool.stop_tnc(name)
        return "", 204

    @app.route("/api/conf", methods=["GET"])
    def getconf():
        """ Returns the whole current configuration object. """
        res = config.get_all_conf()
        return jsonify(res)

    @app.route("/api/conf/constraints", methods=["GET"])
    def get_constraints():
        """ Returns all of the constraints for the configuration. """
        constrs = config.get_constraints()
        return jsonify(constrs)

    @app.route("/api/conf/full", methods=["GET"])
    def get_full_conf():
        res = config.get_conf_with_constraints()
        return res

    @app.route("/", methods=["GET"])
    def get_index():
        return send_file(os.path.join(static_folder, "index.html"))

    @app.errorhandler(404)
    def not_found(e):
        return send_file(os.path.join(static_folder, "index.html"))

    @app.route("/api/static/<path:path>")
    def send_static(path):
        return send_from_directory("static", path)

    @app.route("/api/conf", methods=["POST"])
    def post_set_conf():
        some_json = request.get_json()
        try:
            for i in some_json:
                for j in some_json[i]:
                    config.set_conf(section=i, element=j,
                                    value=some_json[i][j])
        except Exception:
            _, error, _ = sys.exc_info()
            return jsonify({"Error": "{err}".format(err=error)}), 500
        return jsonify(some_json), 200

    return app
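# A self-contained sketch of exercising the conf GET/POST round trip above
# with Flask's built-in test client; the in-memory conf dict stands in for
# the project's Configuration object and is illustrative only.
from flask import Flask, jsonify, request


def make_demo_app():
    app = Flask(__name__)
    conf = {"Client": {"frontend-port": "5000"}}

    @app.route("/api/conf", methods=["GET"])
    def get_conf():
        return jsonify(conf)

    @app.route("/api/conf", methods=["POST"])
    def set_conf():
        body = request.get_json()
        # merge each posted section into the stored configuration
        for section in body:
            conf.setdefault(section, {}).update(body[section])
        return jsonify(body), 200

    return app


client = make_demo_app().test_client()
client.post("/api/conf", json={"Client": {"frontend-port": "8080"}})
print(client.get("/api/conf").get_json())  # {'Client': {'frontend-port': '8080'}}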
import json
import os

import matplotlib.pyplot as plt
import numpy as np

from util import get_root

with open(os.path.join(get_root(), 'output', 'degrees.json'), 'r',
          encoding='utf-8') as f:
    degrees = json.load(f)

xs = [int(x) for x in degrees.keys()]
ys = list(degrees.values())

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.scatter(xs, ys)
ax.set_xlabel('Degree')
ax.set_ylabel('Frequency')
ax.set_title('Degree distribution')
ax.set_xticks(np.arange(0, max(xs), step=200))
fig.savefig('degree_distribution.png')

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.loglog(xs, ys, 'o')
ax.set_xlabel('Degree')
ax.set_ylabel('Frequency')
ax.set_title('Log-log degree distribution')
fig.savefig('degree_distribution_loglog.png')
import sys
import os

from util import get_filename, get_root

# DTU's HPC won't install mrjob. Cloned repo and placed it locally
sys.path.insert(0, os.path.join(get_root(), 'mrjob'))

from mrjob.job import MRJob
from mrjob.step import MRStep
import mrjob.compat

from collections import Counter
import csv

data_path = os.path.join(get_root(), 'data')


class AverageInDegree(MRJob):
    def mapper(self, _, page):
        with open(os.path.join(data_path, 'links', get_filename(page)),
                  mode='r', encoding='utf-8') as f:
            for link in f:
                yield link.strip(), 1

    def reducer_sum(self, link, values):
        yield 'link_in', sum(values)

    def reducer_freq(self, link, values):
        yield 'degrees', dict(Counter(values))

    def steps(self):
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
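# The snippet above is cut off inside steps(). As a complete reference for
# the multi-step MRStep pattern it uses, here is a self-contained two-step
# job: step 1 counts occurrences per key, step 2 histograms those counts.
# The job name and the word-count input are illustrative, not the original.
from collections import Counter
from mrjob.job import MRJob
from mrjob.step import MRStep


class DegreeHistogram(MRJob):
    # step 1: count how many times each word occurs
    def mapper_count(self, _, line):
        for word in line.split():
            yield word, 1

    def reducer_count(self, word, ones):
        yield word, sum(ones)

    # step 2: histogram the counts (how many words occur k times)
    def mapper_hist(self, word, count):
        yield count, 1

    def reducer_hist(self, count, ones):
        yield count, sum(ones)

    def steps(self):
        return [
            MRStep(mapper=self.mapper_count, reducer=self.reducer_count),
            MRStep(mapper=self.mapper_hist, reducer=self.reducer_hist),
        ]


if __name__ == '__main__':
    DegreeHistogram.run()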
def find_shortest_path(start_node: int):
    subgraph_no_counter = 0
    nodes = {index: (sys.maxsize, UNVISITED) for index in neighbors.keys()}
    nodes[start_node] = (subgraph_no_counter, FRONTIER)
    round = 0
    directory = os.path.realpath(
        os.path.join(get_root(), 'output/subgraphs', str(start_node)))
    if not os.path.exists(directory):
        os.mkdir(directory)
    current_file = os.path.join(directory, 'data')
    with open(current_file, 'w', encoding='utf-8') as file:
        for key, value in nodes.items():
            file.write(f'{key},{value}\n')
    current = 0
    print(f'Searching from node {start_node}')
    print(f'Start {datetime.now()}')
    while True:
        while True:
            done = True
            print(current_file)
            job = Subgraphs(args=[current_file])
            with job.make_runner() as runner:
                runner.run()
                round += 1
                with open(current_file, 'w', encoding='utf-8') as f:
                    for key, row in job.parse_output(runner.cat_output()):
                        subgraph_no, state = row
                        f.write(f'{key},({subgraph_no}, {state})\n')
                        if state == UNVISITED or state == FRONTIER:
                            done = False
                        if state == FRONTIER:
                            current += 1
            print(f'{round} completed')
            if current == 0:
                print('No more nodes on the frontier, stopping')
                break
            current = 0
            if done:
                break
        subgraph_no_counter += 1
        all_subgraphs_found = True
        with open(current_file, 'r', encoding='utf-8') as f:
            for line in f:
                m = re.match(regex, str(line))
                key = int(m.group('key'))
                state = int(m.group('state'))
                if state == UNVISITED:
                    print(line)
                    # use a separate handle so we don't shadow the read handle
                    with open(current_file, 'a', encoding='utf-8') as out:
                        out.write(
                            f'{key},({subgraph_no_counter}, {FRONTIER})\n')
                    all_subgraphs_found = False
                    break
        if all_subgraphs_found:
            break
    print(f'Done {datetime.now()}')
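# The outer loop above labels connected subgraphs: run the BFS rounds until
# the frontier empties, then seed the next UNVISITED node with a new
# subgraph number. A minimal in-memory sketch of the same idea (the toy
# graph is illustrative):
def label_subgraphs(graph):
    labels = {}
    subgraph_no = 0
    for seed in graph:
        if seed in labels:
            continue
        frontier = [seed]
        while frontier:          # expand until the frontier empties
            nxt = []
            for n in frontier:
                if n in labels:
                    continue
                labels[n] = subgraph_no
                nxt.extend(graph[n])
            frontier = nxt
        subgraph_no += 1         # next unvisited seed starts a new subgraph
    return labels


# two components: {0, 1} and {2}
print(label_subgraphs({0: [1], 1: [0], 2: []}))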
import os
import sys

root = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
# DTU's HPC won't install mrjob. Cloned repo and placed it locally
sys.path.insert(0, os.path.join(root, 'mrjob'))

from mrjob.job import MRJob

from util import get_filename, get_root
import re
from datetime import datetime

UNVISITED = 0
FRONTIER = 1
VISITED = 2

indexes = {}
with open(os.path.join(get_root(), 'data', 'pages.txt'), 'r',
          encoding='utf-8') as f:
    counter = 0
    for line in f:
        indexes[counter] = line.rstrip('\n')
        counter += 1

neighbors = {}
with open(os.path.join(get_root(), 'data', 'graph_file'), 'r',
          encoding='utf-8') as file:
    for line in file:
        key, values = line.split(':')
        values = values[1:-3]
        if values == '':
def main(argv):
    """ Main loop function. """
    # Parse command line options
    opts, args = getopt(argv, "vc:")
    conf_path = None
    for opt, arg in opts:
        if opt == "-c":
            conf_path = arg
    if conf_path is None:
        # Default conf path
        conf_path = "../configuration.ini"

    # Create the configuration object
    conf = Configuration(conf_path)

    # Set up logging
    if not conf.get_conf("Client", "debug-log"):
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.DEBUG)
    _logger = logging.getLogger(__name__)
    if not os.path.isdir("../logs"):
        os.mkdir("../logs")
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    handler = logging.FileHandler("../logs/system_logs.log")
    handler.setFormatter(formatter)
    logging.getLogger('').addHandler(handler)
    _logger.info("Using configuration from: %s", conf_path)

    # Create the database object
    db_loc = os.path.join(util.get_root(),
                          conf.get_conf("Client", "database"))
    database = TelemetryDB(db_loc)
    database.init_db()

    # Update grafana and kaitai configurations
    if conf.get_conf("Client", "automatic-updating"):
        Updater(conf).checkForUpdates()

    # Build the other components.
    ax_listener = AXListener(conf)
    sids_relay = SIDSRelay(conf, database)
    telemetry_listener = TelemetryListener(database)
    file_logger = FileLogger(conf, conf.get_conf("Client", "logs"), "log")

    # Create the flask app and start it in a forked process.
    port = conf.get_conf("Client", "frontend-port")

    # Set the handler for SIGTERM, so we can exit a bit more gracefully.
    signal.signal(signal.SIGTERM, terminate_handler)

    # Hook the callbacks to the ax_listener.
    ax_listener.add_callback(database.insert_ax_frame)
    ax_listener.add_callback(sids_relay.relay)
    ax_listener.add_callback(file_logger.log_ax_frame)
    ax_listener.add_callback(telemetry_listener.receive)

    tnc_pool = TNCPool(conf, ax_listener)
    tnc_pool.connect_main_tnc()

    api_app = api.create_app(conf, tnc_pool, sids_relay)

    # We set the daemon option to True, so that the client will quit once
    # the other threads have finished, because we don't have a good way of
    # stopping the Flask app properly.
    api_thread = Thread(target=api_app.run, kwargs={"port": port},
                        daemon=True)
    api_thread.start()
    _logger.info("For the GUI open localhost:%s", port)

    try:
        # On Windows, the KeyboardInterrupt doesn't break the join.
        if platform.system() == "Windows":
            while api_thread.is_alive():
                api_thread.join(2)
        else:
            api_thread.join()
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        tnc_pool.cleanup()
import sys
import os

from util import get_filename, get_root

# DTU's HPC won't install mrjob. Cloned repo and placed it locally
sys.path.insert(0, os.path.join(get_root(), 'mrjob'))

from mrjob.job import MRJob

data_path = os.path.join(get_root(), 'data')


class DegreeIn(MRJob):
    def mapper(self, _, page):
        with open(os.path.join(data_path, 'links', get_filename(page)),
                  mode='r', encoding='utf-8') as f:
            for line in f:
                yield line.rstrip('\n'), 1

    def reducer(self, key, values):
        yield key, sum(values)


names_file = os.path.join(data_path, 'pages.txt')
nodes = sum(1 for _ in open(names_file))
job = DegreeIn(args=[names_file, '--jobconf', 'nodes=' + str(nodes)])
with job.make_runner() as runner:
    runner.run()
    with open(os.path.join(get_root(), 'output', 'degree_in.csv'), 'w',