Example #1
0
def preprocess_tokens(docs, nlp):
    """Filter out noisy tokens and lemmatize the remaining ones.

    :param docs: list
        list of documents to be parsed
    :param nlp: spacy.lang.<code>.<language>
        spacy language, e.g. spacy.lang.es.Spanish
    :return filtered_tokens: list
        list of lists of lemmatized and filtered tokens
    """

    try:
        docs = list(docs)
    except TypeError:
        print("Input can't be casted to type 'list'")
        raise

    total = len(docs)  # only needed to drive the progress bar
    filtered_tokens = []
    for idx, parsed in enumerate(nlp.pipe(docs)):
        lemmas = []
        for tok in parsed:
            # Keep only non-noise tokens; spaCy lemmatizes pronouns to the
            # placeholder '-PRON-', which is dropped as well.
            if remove_noise(tok) and tok.lemma_ != '-PRON-':
                lemmas.append(tok.lemma_.lower())
        filtered_tokens.append(lemmas)

        print_progressbar(idx, total)

    return filtered_tokens
Example #2
0
def map_imgs_to_classes(filenames, dataset_name):
    """Load image data and one-hot encoded labels for the given slice files.

    The class of each file is looked up via the first three characters of
    its filename. Returns (features array, one-hot label matrix).
    """
    total = len(filenames)
    label = 'Building {} dataset'.format(dataset_name)
    X, y = [], []
    for idx, name in enumerate(filenames):
        X.append(get_img_data('{}/{}'.format(slices_path, name)))
        y.append(classes[name[:3]])  # first 3 chars of the name key the class
        print_progressbar(idx / total, label)
    return np.array(X), pd.get_dummies(y).values
Example #3
0
def slice_spectrograms():
    """Slice every .png spectrogram found in spectrograms_path.

    Creates the output directory on demand and reports progress while
    delegating the actual slicing to slice_spectrogram().
    """
    all_spectrograms = listdir(spectrograms_path)

    if not path.exists(slices_path):
        makedirs(slices_path)

    total = len(all_spectrograms)
    for idx, name in enumerate(all_spectrograms):
        if not name.endswith('.png'):
            continue  # skip non-spectrogram files
        slice_spectrogram(name)
        print_progressbar(idx / total, 'Slicing')
Example #4
0
    def run(self):
        """Run ffmpeg with the configured arguments and show a progress bar.

        Streams ffmpeg's combined stdout/stderr, extracting the total
        duration from the header and the current time/speed from each
        'frame=...' status line to drive print_progressbar.
        """
        cmd_args = [
            item for sublist in self._get_command() for item in sublist
        ]

        proc = subprocess.Popen(['ffmpeg', '-hide_banner', '-y', *cmd_args],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)

        while True:
            line = proc.stdout.readline()

            if not line:
                break

            line = str(line.rstrip())

            if self._duration is None:
                match = re.search(r'Duration: ([^,]+),', line)
                if match:
                    # Use the captured group: match[0] is the whole match
                    # including the 'Duration: ' prefix and trailing comma,
                    # which parse_time cannot handle.
                    self._duration = parse_time(match[1])

            if line.startswith('frame'):
                status = re.search(
                    r'frame=([\d\s]+).*size=([\d\skmB]+).*time=([\d:.]+).*speed=([\s\d.]+)x',
                    line,
                    flags=re.IGNORECASE)

                if status is None:
                    # Malformed or partial status line — skip it instead of
                    # crashing on a None subscript.
                    continue

                time = self._get_fixed_time_on_run(parse_time(status[3]))
                speed = float(status[4].strip())

                print_progressbar(time,
                                  self._duration,
                                  suffix=f"(speed: {speed}x)")
        proc.wait()  # reap the child so it does not linger as a zombie
        print()
Example #5
0
def render_hourly(session):
    """Render one Graphviz .dot file of bike movements per STEP-sized window.

    Walks from START_DATE to END_DATE in STEP increments, queries the graph
    database session for BIKE_MOVED relations inside each window, and writes
    a MultiDiGraph of station-to-station edges into OUTPUT_DIRECTORY.
    """
    date = START_DATE
    total_steps = (END_DATE - START_DATE) / STEP  # progress-bar denominator
    i = 0
    while date < END_DATE:
        i += 1
        print_progressbar(i / total_steps)
        graph = nx.MultiDiGraph()
        start = date
        end = (date + STEP)
        # NOTE(review): '{param}' Cypher interpolation is the legacy neo4j
        # driver syntax (newer drivers use '$param') — confirm driver version.
        result = session.run(
            """
            MATCH (a:Station)-[r:BIKE_MOVED]->(b:Station)
            WHERE {start} <= r.timestamp_start < {end}
            RETURN a, r, b""", {
                'start': start.timestamp(),
                'end': end.timestamp()
            })
        for record in result:
            # Slashes become ' /\n' so long station names wrap in the label.
            station_a = record['a']['name'].replace('/', ' /\n')
            station_b = record['b']['name'].replace('/', ' /\n')
            bike_id = record['r']['bike_id']
            # NOTE(review): '\:' keeps a literal backslash before the colon
            # in the rendered time — presumably escaping ':' for Graphviz
            # port syntax; confirm the output looks right.
            start_time = datetime.fromtimestamp(
                record['r']['timestamp_start']).strftime('%H\:%M')
            end_time = datetime.fromtimestamp(
                record['r']['timestamp_end']).strftime('%H\:%M')
            label = f'{start_time} -\n{end_time}'
            # Transporter-moved bikes are highlighted in red and drawn thicker.
            color = 'red' if record['r']['transporter'] else '#aaaaaa'
            penwidth = 2 if record['r']['transporter'] else 1
            graph.add_edge(station_a,
                           station_b,
                           label=label,
                           color=color,
                           penwidth=penwidth)
            # graph.add_edge(station_a, station_b, label=bike_id)
        filename = f"{start.strftime('%Y-%m-%d_%H_%M')} - {end.strftime('%Y-%m-%d_%H_%M')}.dot"
        write_dot(graph, os.path.join(OUTPUT_DIRECTORY, filename))
        date = end
    clear_progressbar()
Example #6
0
def generate_spectrograms():
    """Render a monochrome spectrogram PNG for every .wav in the dataset.

    Output files are named '<categories>_<index>.png', where the categories
    are the bracketed three-letter tags extracted from the source filename.
    """
    current_path = path.dirname(path.realpath(__file__))
    # recursive=True is required for '**' to actually descend into
    # subdirectories; without it the pattern behaves like a single '*'.
    filenames = glob.glob('{}/**/*.wav'.format(dataset_path), recursive=True)
    # Matches bracketed tag groups such as '[abc]' or '[abc_def]'.
    category_re = re.compile(r'(?:\[((?:[a-z]{3}(?:_)?)+)\])')

    for index, filename in enumerate(filenames):
        categories = category_re.findall(filename)
        newname = '{}/{}_{}.png'.format(spectrograms_path,
                                        '_'.join(categories), index)
        # Pass an argument list (shell=False) so filenames containing spaces
        # or shell metacharacters cannot break — or inject into — the command.
        cmd = [
            'sox', filename, '-n', 'spectrogram', '-Y', str(slice_size),
            '-m', '-r', '-o', newname
        ]
        p = Popen(cmd,
                  stdin=PIPE,
                  stdout=PIPE,
                  stderr=STDOUT,
                  close_fds=True,
                  cwd=current_path)
        output, errors = p.communicate()
        if errors:
            print(errors)
        print_progressbar(index / len(filenames), 'Generating')
Example #7
0
def create_bikes(number_of_samples=1):
    """Collect every bike seen across the XML dumps and write bikes.csv.

    :param number_of_samples: expected number of dumps; only used to scale
        the progress bar.
    """
    track_a_bike = TrackABike()
    fieldnames = ['number', 'version', 'marke_id', 'marke_name', 'is_pedelec']
    headernames = {
        'number': 'bike_id:ID(Bike)',
        'version': 'version:INT',
        'marke_id': 'marke_id:INT',
        'is_pedelec': 'is_pedelec:BOOLEAN'
    }
    bikes = {}
    sample_index = 0
    for timestamp, data in read_xml_dumps():
        sample_index += 1
        # Bikes that are rented out or in maintenance are missing from any
        # single snapshot, so many snapshots must be scanned — but one per
        # hour (every 60th dump) is enough; skip the rest to save time.
        if sample_index % 60 != 0:
            continue
        print_progressbar(sample_index / number_of_samples)
        track_a_bike.load_xml(data)
        for station in track_a_bike.stations.values():
            for bike in station['free_bikes']:
                # Re-key each field to its neo4j CSV header name; later
                # sightings of the same bike simply overwrite earlier ones.
                bikes[bike['number']] = {
                    headernames.get(field, field): bike[field]
                    for field in fieldnames
                }
    with open(os.path.join(CSV_DIRECTORY, 'bikes.csv'), 'w') as f:
        writer = csv.DictWriter(f, [headernames.get(x, x) for x in fieldnames])
        writer.writeheader()
        rows = sorted(bikes.values(), key=lambda r: r['bike_id:ID(Bike)'])
        writer.writerows(rows)
Example #8
0
def create_bike_positions_and_movement(number_of_samples=1):
    """Write per-snapshot bike positions and inferred movements to CSV.

    Replays every XML dump; each bike sighting becomes a row in
    bike_positions.csv, and a bike reappearing at a different station than
    its previous sighting becomes a movement row in bike_movements.csv.

    :param number_of_samples: expected number of dumps; only used to scale
        the progress bar. Defaults to 1 — the previous default of 0 made
        ``i / number_of_samples`` raise ZeroDivisionError on the first dump,
        and the sibling create_bikes() already defaults to 1.
    """
    fieldnames_position = [
        'number', 'timestamp', 'can_be_rented', 'can_be_returned', 'station_id'
    ]
    headernames_position = {
        'number': ':START_ID(Bike)',
        'station_id': ':END_ID(Station)',
        'timestamp': 'timestamp:INT',
        'can_be_rented': 'can_be_rented:BOOLEAN',
    }
    fieldnames_movement = [
        ':START_ID(Station)', ':END_ID(Station)', 'timestamp_start:INT',
        'timestamp_end:INT', 'duration:INT', 'bike_id:INT'
    ]
    with open(os.path.join(CSV_DIRECTORY, 'bike_positions.csv'), 'w') as f, \
            open(os.path.join(CSV_DIRECTORY, 'bike_movements.csv'),
                 'w') as f2:
        position_writer = csv.DictWriter(
            f,
            [headernames_position.get(x, x) for x in fieldnames_position])
        movement_writer = csv.DictWriter(f2, fieldnames_movement)
        position_writer.writeheader()
        movement_writer.writeheader()
        i = 0
        track_a_bike = TrackABike()
        # bike_id -> {'id': station id, 'timestamp': last-seen time}
        current_bike_positions = {}
        for timestamp, data in read_xml_dumps():
            i += 1
            # Guard against a zero denominator if a caller still passes 0.
            print_progressbar(i / number_of_samples if number_of_samples
                              else 0)
            track_a_bike.load_xml(data)
            for station in track_a_bike.stations.values():
                bike_positions = []
                for bike in station['free_bikes']:
                    bike_id = bike['number']
                    prev_station = current_bike_positions.get(bike_id, None)
                    # A bike seen at a new station implies a movement from
                    # its previously recorded station.
                    if prev_station is not None and \
                            prev_station['id'] != station['id']:
                        duration = (timestamp - prev_station['timestamp'])
                        movement_writer.writerow({
                            ':START_ID(Station)':
                            prev_station['id'],
                            ':END_ID(Station)':
                            station['id'],
                            'timestamp_start:INT':
                            int(prev_station['timestamp'].timestamp()),
                            'timestamp_end:INT':
                            int(timestamp.timestamp()),
                            'duration:INT':
                            int(duration.total_seconds()),
                            'bike_id:INT':
                            bike_id,
                        })
                    current_bike_positions[bike_id] = {
                        'id': station['id'],
                        'timestamp': timestamp
                    }
                    # Map sighting fields to CSV header names; station and
                    # timestamp come from the snapshot, not the bike record.
                    bike_position = {
                        headernames_position.get(key, key):
                        bike.get(key, None)
                        for key in fieldnames_position
                    }
                    bike_position[
                        headernames_position['station_id']] = station['id']
                    bike_position[headernames_position['timestamp']] = int(
                        timestamp.timestamp())
                    bike_positions.append(bike_position)
                position_writer.writerows(bike_positions)