def _load_data(self, sample_set, sub_sampling):
    """Load the stored arrays and tags for every row of ``sample_set``.

    For each row, the ``PATH`` column (with ``.mp3`` swapped for ``.npy``)
    is resolved against ``self.data_path`` and loaded with ``np.load``.

    :param sample_set: object exposing ``iterrows()`` whose rows carry
        ``PATH`` and ``TAGS`` entries (presumably a pandas DataFrame).
    :param sub_sampling: when truthy, every window returned by
        ``utils.get_windows`` for the sample is kept; otherwise only the
        first window of a ``'center'`` request is kept.
    :return: ``(X, y)`` as numpy arrays; ``X`` gets one extra trailing axis
        of length 1.
    """
    X = []
    y = []
    for _, sample in sample_set.iterrows():
        sample_path = sample['PATH'].replace('.mp3', '.npy')
        sample_data = np.load(path.join(self.data_path, sample_path))
        if sub_sampling:
            windows = list(utils.get_windows(sample=sample_data,
                                             window=self.window,
                                             window_size=self.window_size,
                                             num_windows=self.num_windows))
            X.extend(windows)
            # Emit one label per window actually produced, so X and y stay
            # aligned even if fewer than ``self.num_windows`` come back.
            y.extend([sample['TAGS']] * len(windows))
        else:
            center_sample = utils.get_windows(sample=sample_data,
                                              window='center',
                                              window_size=self.window_size,
                                              num_windows=1)
            X.append(center_sample[0])
            y.append(sample['TAGS'])
    X = np.array(X)
    y = np.array(y)
    # Append a singleton axis (presumably a channel dimension - confirm
    # against the consumer of X).
    X = X.reshape(*X.shape, 1)
    return X, y
def get_trip_distance(self, trip: Iterable[str]) -> int:
    """
    Add up the distance of every consecutive leg of ``trip``, looking each
    ``(from, to)`` pair up in ``self.distances``.

    >>> Network().get_trip_distance([])
    0
    >>> Network.from_distances_text(
    ...     "London to Dublin = 464\\n"
    ...     "London to Belfast = 518\\n"
    ...     "Dublin to Belfast = 141\\n"
    ... ).get_trip_distance(['Dublin', 'London', 'Belfast'])
    982
    """
    total = 0
    for origin, destination in get_windows(trip, 2):
        total += self.distances[(origin, destination)]
    return total
def online_identification(k, w, window_size, train, test):
    """Classify the test data window by window with a freshly fitted KnnDtw.

    The CSVs named by ``train`` and ``test`` are loaded without
    preprocessing. Time windows of ``window_size`` are walked from 0 up to
    the last ``seconds`` value of the first test CSV; for each one a
    ``KnnDtw`` model is fitted on the training window's
    ``pose_position_z`` series and the predicted label for the test window
    is echoed.

    :param k: forwarded to ``KnnDtw`` as ``k_neighbours``.
    :param w: forwarded to ``KnnDtw`` as ``max_warping_window``.
    :param window_size: step between window starts; each window spans
        ``window_size - 1`` past its start.
    :param train: training CSV source handed to ``load_csvs``.
    :param test: test CSV source handed to ``load_csvs``.
    """
    click.echo('\n--- Online flight degradation identification ---')

    train_csv = load_csvs(train, should_preprocess=False)
    test_csv = load_csvs(test, should_preprocess=False)

    click.echo(' - k : %d ' % k)
    click.echo(' - w : %d ' % w)
    click.echo(' - train : %s ' % train)
    click.echo(' - test : %s ' % test)
    click.echo('\nRunning...\n')

    # The first test CSV decides how far the windows run.
    final_second = test_csv[0]['data']['seconds'].iloc[-1]

    for window_start in range(0, final_second, window_size):
        click.echo('Time window is: %d ' % window_start)
        window_end = window_start + window_size - 1

        train_window = get_windows(train_csv, window_start, window_end)
        test_window = get_windows(test_csv, window_start, window_end)

        train_data, train_label = window_to_lists(train_window,
                                                  'pose_position_z')
        test_data, _ = window_to_lists(test_window, 'pose_position_z')

        classifier = KnnDtw(k_neighbours=k, max_warping_window=w)
        classifier.fit(np.array(train_data), np.array(train_label))

        predicted_label, _ = classifier.predict(test_data, parallel=True)

        click.echo('Predicted label : %s ' % str(predicted_label))
        click.echo('\n')

    click.echo('\nDone.')
def at_least_one_letter_twice_in_a_row(self) -> bool:
    """
    Tell whether any two neighbouring characters of ``self.string`` are
    equal.

    >>> String("xx").at_least_one_letter_twice_in_a_row()
    True
    >>> String("abcdde").at_least_one_letter_twice_in_a_row()
    True
    >>> String("aabbccdd").at_least_one_letter_twice_in_a_row()
    True
    >>> String("jchzalrnumimnmhp").at_least_one_letter_twice_in_a_row()
    False
    """
    return any(
        left == right
        for left, right in get_windows(self.string, 2)
    )
def get_arrangement_happiness(self, arrangement: Tuple[str, ...]) -> int:
    """
    Total the ``happiness_map`` entries for an arrangement.

    Fewer than two entries score 0. Exactly two entries score the single
    ``(first, second)`` entry. Anything longer is treated as a ring: every
    consecutive pair is scored, including last-to-first.

    >>> AttendeeSet({('a', 'b'): 2, ('b', 'a'): 2, ('b', 'c'): -1,
    ...     ('c', 'a'): 5})\\
    ...     .get_arrangement_happiness(('a', 'b'))
    2
    >>> AttendeeSet({('a', 'b'): 2, ('b', 'c'): -1, ('c', 'a'): 5})\\
    ...     .get_arrangement_happiness(('a', 'b', 'c'))
    6
    """
    size = len(arrangement)
    if size < 2:
        return 0
    if size == 2:
        first, second = arrangement
        return self.happiness_map[(first, second)]

    # Close the ring by repeating the first entry at the end.
    ring = arrangement + arrangement[:1]
    total = 0
    for lhs, rhs in get_windows(ring, 2):
        total += self.happiness_map[(lhs, rhs)]
    return total
def get_minimum_step_count(
    self, start: int = 0, return_to_start: bool = False,
    debugger: Debugger = Debugger(enabled=False),
) -> int:
    """
    Try every visiting order of the other nodes (beginning at ``start`` and,
    if ``return_to_start`` is set, ending there as well) and return the
    smallest total of ``get_shortest_distance`` over consecutive nodes.

    >>> Graph({
    ...     (0, 1): 2, (1, 0): 2,
    ...     (0, 4): 2, (4, 0): 2,
    ...     (1, 2): 6, (2, 1): 6,
    ...     (2, 3): 2, (3, 2): 2,
    ...     (3, 4): 8, (4, 3): 8,
    ... }).get_minimum_step_count()
    14
    """
    nodes = self.get_nodes()
    if start not in nodes:
        raise Exception(f"Start {start} is not in nodes {nodes}")

    head = (start, )
    tail = head if return_to_start else ()
    remaining = set(nodes) - {start}
    candidate_routes = (
        head + middle + tail
        for middle in itertools.permutations(remaining)
    )

    best = None
    # One cache shared by every route so repeated legs are resolved once.
    shared_cache = {}
    for route in debugger.stepping(candidate_routes):
        route_length = 0
        for node_a, node_b in get_windows(route, 2):
            route_length += self.get_shortest_distance(
                node_a, node_b, nodes,
                trip_distances_cache=shared_cache)
        if best is None or route_length < best:
            best = route_length
        if debugger.should_report():
            debugger.default_report(f"min distance: {best}")

    return best
def get_trip_distance(
    self, trip: Iterable[int],
    trip_distances_cache: Optional[Dict[Tuple[int, ...], int]] = None,
) -> Optional[int]:
    """
    Sum ``self.edges`` over consecutive node pairs of ``trip``.

    Returns ``None`` when the trip has fewer than two nodes or any pair is
    missing from ``self.edges``. Results (including ``None``) are stored in
    ``trip_distances_cache`` keyed by ``trip`` and reused on later calls.

    >>> Graph({}).get_trip_distance(())
    >>> Graph({}).get_trip_distance((1,))
    >>> Graph({}).get_trip_distance((1, 2))
    >>> Graph({(1, 3): 3, (3, 2): 4}).get_trip_distance((1, 2))
    >>> Graph({(2, 1): 3}).get_trip_distance((1, 2))
    >>> Graph({(1, 2): 3}).get_trip_distance((1, 2))
    3
    >>> Graph({(1, 2): 3, (2, 4): 4}).get_trip_distance((1, 2, 3))
    >>> Graph({(1, 2): 3, (3, 2): 4}).get_trip_distance((1, 2, 3))
    >>> Graph({}).get_trip_distance((1, 2, 3))
    >>> Graph({(1, 2): 3, (2, 3): 4}).get_trip_distance((1, 2, 3))
    7
    """
    cache = {} if trip_distances_cache is None else trip_distances_cache
    if trip in cache:
        return cache[trip]

    result = None
    running_total = 0
    saw_pair = False
    for pair in get_windows(trip, 2):
        step = self.edges.get(pair)
        if step is None:
            # A missing edge makes the whole trip impossible.
            break
        saw_pair = True
        running_total += step
    else:
        # Only reached when no edge was missing.
        if saw_pair:
            result = running_total

    cache[trip] = result
    return result
def get_keys(self, _count: Optional[int] = 64, start: int = 0,
             window_size: int = 1000, debug: bool = False) -> Iterable[int]:
    """
    Lazily scan the hash stream for keys.

    Each window holds one candidate hash followed by the next
    ``window_size`` hashes; the candidate is reported when
    ``is_hash_a_key`` accepts it. Scanning stops once ``_count`` keys have
    been found (never, if ``_count`` is ``None``).

    NOTE(review): although annotated ``Iterable[int]``, every yielded item
    is a ``(key_count, index, hash)`` tuple.

    :param _count: number of keys to yield before stopping, or ``None``.
    :param start: forwarded to ``get_all_hashes``.
    :param window_size: how many following hashes accompany each candidate.
    :param debug: when true, print progress every 1000 candidates.
    """
    hash_windows = get_windows(self.get_all_hashes(start), window_size + 1)
    found = 0
    if debug:
        timer = Timer()
    for index, window in enumerate(hash_windows):
        candidate = window[0]
        followers = window[1:]
        if self.is_hash_a_key(candidate, followers):
            found += 1
            yield found, index, candidate
            if _count is not None and found >= _count:
                break
        if debug and index % 1000 == 0:
            print(f"Count: {index}, time: "
                  f"{timer.get_pretty_current_duration()}, key count: "
                  f"{found}/{_count}")
def compute_labels(data, dataarray, w, window_size, coordinate, should_plot):
    """Label each time window of every flight by its DTW distance to a
    desired flight and write one labelled CSV per flight.

    For every window of ``window_size`` (up to the last ``seconds`` value of
    the first desired CSV) the ``coordinate`` series of each flight is
    compared with the desired flight's series via ``dtw_distance``. The
    distance is mapped to a label (``< 100`` -> 1, ``< 299`` -> 2,
    otherwise 3). The windows of a flight are concatenated and saved to
    ``data/labelled/z/f_<flight index>.csv``.

    :param data: source of the desired flight, handed to ``load_csvs``.
    :param dataarray: source of the flights to label, handed to
        ``load_csvs``.
    :param w: third argument to ``dtw_distance`` (presumably the warping
        window - confirm).
    :param window_size: step between window starts.
    :param coordinate: column whose values are compared.
    :param should_plot: when truthy, save per-flight and per-window plots
        under ``plots/z/``.
    """
    click.echo('--- Compute labels ---')

    desired_csv = load_csvs(data, should_preprocess=False)
    data_csv = load_csvs(dataarray, should_preprocess=False)

    # Tag every row with the index of the flight it came from so windows
    # can be regrouped per flight later.
    for index, flight_data_number in enumerate(data_csv):
        flight_data_number['data']['flight_number'] = index

    # For Test
    last_second = desired_csv[0]['data']['seconds'].iloc[-1]

    click.echo(' - dataarray : %s ' % dataarray)
    click.echo(' - w : %d ' % w)
    click.echo('\nRunning...')

    # Timestamp shared by all plot file names of this run.
    st = datetime.datetime.fromtimestamp(time.time()).strftime(
        '%Y-%m-%d_%H%M%S')

    flight_dict = {}

    for time_window in range(0, last_second, window_size):
        click.echo('Time window is: %d ' % time_window)
        window_end = time_window + window_size - 1
        desired_window = get_windows(desired_csv, time_window, window_end)
        data_window = get_windows(data_csv, time_window, window_end)

        # Compute all DTWs against the desired flight.
        desired_flight = desired_window[0]['window'][coordinate].tolist()
        dtws = []
        for index, flight_window in enumerate(data_window):
            pos_data = flight_window['window'][coordinate].tolist()
            dtw = dtw_distance(desired_flight, pos_data, w)
            flight_window['dtw'] = dtw
            dtws.append(dtw)
            print(dtw)

            if should_plot:
                # Desired data against this flight for this window.
                plt.plot(desired_flight, label='Desired')
                plt.plot(pos_data, label='Unlabelled')
                plt.ylim(0, 2.1)
                plt.xlabel('Seconds')
                plt.ylabel('Meters')
                plt.title('Desired and unlabelled data. DTW = {}'.format(dtw))
                plt.legend()
                plt.grid(True)
                plt.savefig(
                    'plots/z/desired_each_flight/dtwValues_{}_{}_{}.jpeg'
                    .format(st, time_window, index))
                plt.cla()

        print(dtws)

        if should_plot:
            # Clamp the huge sentinel value (numerically float(sys.maxsize);
            # presumably what dtw_distance returns when no path exists) so
            # it stays inside the plotted range.
            plot_dtws = [550 if x == 9.223372036854776e+18 else x
                         for x in dtws]

            # DTW values of all flights for this window.
            plt.figure(figsize=(20, 10))
            plt.plot(plot_dtws, color='#006600', alpha=.9)
            plt.ylim([0, 600])
            # Annotate every flight, however many there are.
            for flight_x, dtw in enumerate(plot_dtws):
                plt.text(flight_x, dtw, str(dtw))
            plt.grid(True)
            plt.savefig('plots/z/dtw_values/dtwValues_{}_{}.pdf'.format(
                st, time_window))
            # Close the summary figure so one figure is not leaked per
            # time window.
            plt.close()
            click.echo('Done. Plots have been saved.')

        # Loop again and update labels.
        for flight_window in data_window:
            if flight_window['dtw'] < 100:
                flight_window['label'] = 1
            elif flight_window['dtw'] < 299:
                flight_window['label'] = 2
            else:
                flight_window['label'] = 3

            fn = flight_window['window']['flight_number'].iloc[0]
            # Work on an explicit copy so the new columns do not trigger
            # chained-assignment problems on the window slice.
            flight_full_data = flight_window['window'].copy()
            flight_full_data['label'] = flight_window['label']
            flight_full_data['dtw'] = flight_window['dtw']

            if fn not in flight_dict:
                flight_dict[fn] = flight_full_data
            else:
                flight_dict[fn] = pd.concat(
                    [flight_dict[fn], flight_full_data])

    for flight_index, flight_data in flight_dict.items():
        flight_dataframe = pd.DataFrame(flight_data)
        flight_dataframe.to_csv(
            'data/labelled/z/f_{}.csv'.format(flight_index),
            sep=';', encoding='utf-8')
        print('f_{}.csv saved!'.format(flight_index))

    click.echo('\nDone.')
def lda(path):
    """Preprocess the documents under ``path``, train word2vec and LDA on
    them, save the resulting arrays and return the topic summaries.

    Seven ``.npy`` files are written under ``newsgroups/utils/npy/``
    (relative to the current working directory, which is printed first).

    :param path: forwarded to ``fetch_data_groups`` as ``data_home``.
    :return: one string per topic, ``'topic<i>:'`` followed by its
        space-joined words.
    """
    # words with count < MIN_COUNTS
    # and count > MAX_COUNTS
    # will be removed
    MIN_COUNTS = 20
    MAX_COUNTS = 1800

    # minimum document length (number of words) after preprocessing
    MIN_LENGTH = 15

    # half the size of the context around a word;
    # it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH
    HALF_WINDOW_SIZE = 5

    nlp = spacy.load('en')
    dataset = fetch_data_groups(data_home=path)
    raw_docs = dataset['data']
    paths = dataset['filenames']
    filenames = [file_path.split("/")[-1] for file_path in paths]
    docs = list(enumerate(raw_docs))

    encoded_docs, decoder, word_counts = preprocess(
        docs, nlp, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS)

    # New ids are assigned to the surviving documents; keep a way back to
    # the initial ids and file names.
    doc_decoder = {
        new_id: doc_id
        for new_id, (doc_id, _) in enumerate(encoded_docs)
    }
    filename_decoder = {
        new_id: filenames[encoded_docs[new_id][0]]
        for new_id in range(len(encoded_docs))
    }

    # Each row: [document id, word, *window around this word].
    data = []
    for index, (_, doc) in enumerate(encoded_docs):
        windows = get_windows(doc, HALF_WINDOW_SIZE)
        data.extend([[index, w[0]] + w[1] for w in windows])
    data = np.array(data, dtype='int64')

    word_counts = np.array(word_counts)
    unigram_distribution = word_counts / sum(word_counts)

    vocab_size = len(decoder)
    embedding_dim = 50

    # train a skip-gram word2vec model
    id_texts = [[str(token) for token in doc] for _, doc in encoded_docs]
    model = models.Word2Vec(id_texts, size=embedding_dim, window=5,
                            workers=4, sg=1, negative=15, iter=70)
    model.init_sims(replace=True)

    word_vectors = np.zeros((vocab_size, embedding_dim)).astype('float32')
    for token_id in decoder:
        word_vectors[token_id] = model.wv[str(token_id)]

    word_texts = [[decoder[token] for token in doc]
                  for _, doc in encoded_docs]
    dictionary = corpora.Dictionary(word_texts)
    corpus = [dictionary.doc2bow(text) for text in word_texts]

    n_topics = 2
    lda_model = models.LdaModel(corpus, alpha=0.9, id2word=dictionary,
                                num_topics=n_topics)
    corpus_lda = lda_model[corpus]

    doc_weights_init = np.zeros((len(corpus_lda), n_topics))
    for row in range(len(corpus_lda)):
        for topic_id, prob in corpus_lda[row]:
            doc_weights_init[row, topic_id] = prob

    print(os.getcwd())
    outputs = (
        ('newsgroups/utils/npy/data.npy', data),
        ('newsgroups/utils/npy/word_vectors.npy', word_vectors),
        ('newsgroups/utils/npy/unigram_distribution.npy',
         unigram_distribution),
        ('newsgroups/utils/npy/decoder.npy', decoder),
        ('newsgroups/utils/npy/doc_decoder.npy', doc_decoder),
        ('newsgroups/utils/npy/doc_weights_init.npy', doc_weights_init),
        ('newsgroups/utils/npy/filename_decoder.npy', filename_decoder),
    )
    for target, payload in outputs:
        np.save(target, payload)

    return [
        'topic' + str(topic_id) + ':' + ' '.join([t for t, _ in topics])
        for topic_id, topics in lda_model.show_topics(n_topics,
                                                      formatted=False)
    ]
class Password(str):
    """A lowercase-letter password that can be validated and incremented."""

    FORBIDDEN_LETTERS = PASSWORD_FORBIDDEN_LETTERS
    # Every run of three consecutive alphabet letters that contains no
    # forbidden letter.
    THREE_INCREASING_LETTERS_SEQUENCES = [
        "".join(letters)
        for letters in get_windows(string.ascii_lowercase, 3)
        if not set(letters) & set(PASSWORD_FORBIDDEN_LETTERS)
    ]
    # letter -> (successor, wrapped), where the successor skips one
    # forbidden letter and ``wrapped`` is true when the successor is 'a'.
    NEXT_LETTER_AND_OVERFLOWED = {
        letter: (next_letter, next_letter == 'a')
        for letter, next_letter in (
            (letter, (next_letter
                      if next_letter not in PASSWORD_FORBIDDEN_LETTERS else
                      alternate_next_letter))
            for letter, next_letter, alternate_next_letter in zip(
                tuple(string.ascii_lowercase),
                tuple(string.ascii_lowercase)[1:] + ('a', ),
                tuple(string.ascii_lowercase)[2:] + ('a', 'b'),
            ))
    }

    # Two doubled letters, the second starting after the first ends. Each
    # group is exactly ONE character: the previous ``(\w+)\1`` also accepted
    # repeated multi-letter chunks such as "abab" as a "pair".
    re_two_instances_of_two_repeated_letters = re.compile(
        r"(\w)\1.*(\w)\2")

    def get_next_password(
        self, debugger: Debugger = Debugger(enabled=False),
    ) -> 'Password':
        """
        Return the first valid password after this one.

        >>> Password('abcdefgh').get_next_password()
        'abcdffaa'
        >>> Password('ghijklmn').get_next_password()
        'ghjaabcc'
        """
        debugger.reset()
        for password in debugger.stepping(self.get_next_password_candidates()):
            if password.is_valid():
                return password
            debugger.default_report_if(password)

    def get_next_password_candidates(self) -> Iterable['Password']:
        """
        Yield every successive password after this one, without end.

        >>> # noinspection PyUnresolvedReferences
        >>> [password for password, _
        ...  in zip(Password('xx').get_next_password_candidates(), range(6))]
        ['xy', 'xz', 'ya', 'yb', 'yc', 'yd']
        """
        current = self
        while True:
            current = current.get_next_password_candidate()
            yield current

    def get_next_password_candidate(self) -> 'Password':
        """
        Increment the password by one, carrying leftwards on wrap-around.

        Raises ``Exception`` when the carry runs past the first letter.

        >>> Password('xx').get_next_password_candidate()
        'xy'
        >>> Password('xy').get_next_password_candidate()
        'xz'
        >>> Password('xz').get_next_password_candidate()
        'ya'
        """
        position = -1
        result = list(self)
        while True:
            letter = result[position]
            next_letter, overflow = self.NEXT_LETTER_AND_OVERFLOWED[letter]
            result[position] = next_letter
            if not overflow:
                break
            # This letter wrapped around: carry into the letter to its left.
            position -= 1
            if -position > len(result):
                raise Exception(f"Password overflowed: {''.join(result)}")

        cls = type(self)
        return cls(''.join(result))

    def is_valid(self) -> bool:
        """
        Check all three password rules.

        >>> Password('abcdefgh').is_valid()
        False
        >>> Password('abcdffaa').is_valid()
        True
        >>> Password('ghijklmn').is_valid()
        False
        >>> Password('ghjaabcc').is_valid()
        True
        """
        return (self.has_three_increasing_letters()
                and self.does_not_include_forbidden_letters()
                and self.contains_two_instances_of_two_repeated_letters())

    def has_three_increasing_letters(self) -> bool:
        """
        Tell whether any allowed three-letter alphabet run occurs in the
        password.

        >>> Password('hijklmmn').has_three_increasing_letters()
        False
        >>> Password('abbceffg').has_three_increasing_letters()
        False
        """
        return any(sequence in self
                   for sequence in self.THREE_INCREASING_LETTERS_SEQUENCES)

    def does_not_include_forbidden_letters(self) -> bool:
        """
        Tell whether the password is free of every forbidden letter.

        >>> Password('hijklmmn').does_not_include_forbidden_letters()
        False
        """
        return not any(letter in self for letter in self.FORBIDDEN_LETTERS)

    def contains_two_instances_of_two_repeated_letters(self) -> bool:
        """
        Tell whether the password holds two non-overlapping doubled letters.

        >>> Password('abbceffg')\\
        ...     .contains_two_instances_of_two_repeated_letters()
        True
        >>> Password('abbcegjk')\\
        ...     .contains_two_instances_of_two_repeated_letters()
        False
        >>> Password('ababcdee')\\
        ...     .contains_two_instances_of_two_repeated_letters()
        False
        """
        return (self.re_two_instances_of_two_repeated_letters.search(self)
                is not None)