def convertToMatrix(self, filename):
    negOrder = -1 * (self.order)
    set1 = OrderedSet()
    for k, v in self.d.items():
        for x, y in v.items():
            # s = x[negOrder:]
            toState = k
            fromState = x
            set1.add(toState)
            set1.add(fromState)
    self.array = np.zeros(shape=(len(set1), len(set1)))
    for k, v in self.d.items():
        for x, y in v.items():
            summation = 0
            for m, n in self.d.items():
                for p, q in n.items():
                    if (p == x):
                        summation += q
            # s = x[negOrder:]
            toState = k
            fromState = x
            self.array[set1.index(fromState)][set1.index(toState)] = y / summation
    print(len(set1))
    # dict1 = {}
    # pd.DataFrame(self.array).to_csv("yash.csv")
    # for row in self.array:
    #     for item in set1:
    #         dict1.update({item: row})
    df = pd.DataFrame(self.array)
    df.to_csv(filename)
def _get_pmx_crossed_sequence(sequence_a: OrderedSet, sequence_b: OrderedSet,
                              part_from_a: OrderedSet, part_from_b: OrderedSet,
                              start_index: int, end_index: int) -> List[int]:
    """
    Returns a sequence whose base comes from 'sequence_b', with 'part_from_a'
    copied in over the slice [start_index:end_index].
    """
    new_sequence = list(sequence_b)
    elements_requiring_correction = {}
    uniques_from_b_part = part_from_b - part_from_a
    for unique_from_b_part in uniques_from_b_part:
        index_in_part = part_from_b.index(unique_from_b_part)
        elements_requiring_correction[unique_from_b_part] = part_from_a[index_in_part]
    for elem_from_b, elem_from_a in elements_requiring_correction.items():
        while elem_from_a in part_from_b:
            index_of_elem_from_b = sequence_b.index(elem_from_a)
            elem_from_a = sequence_a[index_of_elem_from_b]
        new_index = sequence_b.index(elem_from_a)
        new_sequence[new_index] = elem_from_b
    new_sequence[start_index:end_index] = part_from_a
    return new_sequence
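# A minimal usage sketch (not from the original) of the PMX crossover above,
# assuming the function is in scope. The parent permutations and the slice
# bounds are illustrative values only.
from ordered_set import OrderedSet

parent_a = OrderedSet([1, 2, 3, 4, 5, 6, 7, 8])
parent_b = OrderedSet([8, 6, 4, 2, 7, 5, 3, 1])
start, end = 2, 5
part_a = parent_a[start:end]   # OrderedSet([3, 4, 5])
part_b = parent_b[start:end]   # OrderedSet([4, 2, 7])

child = _get_pmx_crossed_sequence(parent_a, parent_b, part_a, part_b, start, end)
# positions 2:5 come from parent_a; displaced elements of parent_b are
# re-placed via the mapping, so the child is still a valid permutation:
print(child)  # [8, 6, 3, 4, 5, 7, 2, 1]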
def getPaper(url):
    try:
        article = quickSoup(url)
        t = article.get_text()
        if "The abstract you requested was not found" in t:
            return "{},".format(url)
        title = article.find('h1').get_text().replace("\n", "")
        test_list = OrderedSet(t.split("\n"))
        authors = test_list[0].replace(title, "").replace(" :: SSRN", "").replace(
            " by ", "").replace(", ", ":")
        date = [
            line.replace("Last revised: ", "") for line in test_list
            if "Last revised: " in line
        ]
        if date == []:
            # Fall back to the posting date when no revision date is present.
            date = [
                line.replace("Posted: ", "") for line in test_list
                if "Posted: " in line
            ]
        date = date[0]
        text = t.split("Abstract\n")[1]
        abstract = "\"{}\"".format(
            text.split("Suggested Citation:")[0].replace("\n", ""))
        # get paper statistics
        stats = OrderedSet(
            article.find('div', attrs={
                'class': 'box-paper-statics'
            }).get_text().split("\n"))
        views, dl, rank, refs = "", "", "", ""
        try:
            views = stats[stats.index('Abstract Views') + 1].strip().replace(",", "")
        except Exception:
            pass
        try:
            dl = stats[stats.index('Downloads') + 1].strip().replace(",", "")
        except Exception:
            pass
        try:
            rank = stats[stats.index('rank') + 1].strip().replace(",", "")
        except Exception:
            pass
        try:
            refs = stats[stats.index('References') + 1].strip().replace(",", "")
        except Exception:
            pass
        results = [
            url, "\"{}\"".format(title), abstract, authors, date, views, dl,
            rank, refs
        ]
        return ",".join(results)
    except Exception:
        return "{},,,,,,,,".format(url)
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()
    labels = OrderedSet(orig_index)
    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return normalize(mat.tocsr(shape), norm='l1', axis=1), index
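# A minimal, self-contained sketch (not from the original) of the OrderedSet
# idiom the builder above relies on: `add` returns the element's index, so each
# label gets a stable row/column number the first time it is seen.
from ordered_set import OrderedSet

labels = OrderedSet(['/c/en/cat'])    # pre-populated, as with `orig_index`
print(labels.add('/c/en/dog'))        # 1 -- a new label gets the next index
print(labels.add('/c/en/cat'))        # 0 -- an existing label keeps its index
print(labels.index('/c/en/dog'))      # 1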
def test_indexing():
    set1 = OrderedSet('abracadabra')
    assert set1[:] == set1
    assert set1.copy() == set1
    assert set1 is set1
    assert set1[:] is not set1
    assert set1.copy() is not set1
    assert set1[[1, 2]] == OrderedSet(['b', 'r'])
    assert set1[1:3] == OrderedSet(['b', 'r'])
    assert set1.index('b') == 1
    assert set1.index(['b', 'r']) == [1, 2]
    with pytest.raises(KeyError):
        set1.index('br')
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()
    labels = OrderedSet(orig_index)
    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return mat.tocsr(shape), index
def test_indexing():
    set1 = OrderedSet('abracadabra')
    eq_(set1[:], set1)
    eq_(set1.copy(), set1)
    assert set1[:] is set1
    assert set1.copy() is not set1
    eq_(set1[[1, 2]], OrderedSet(['b', 'r']))
    eq_(set1[1:3], OrderedSet(['b', 'r']))
    eq_(set1.index('b'), 1)
    eq_(set1.index(('b', 'r')), [1, 2])
    try:
        set1.index('br')
        assert False, "Looking up a nonexistent key should be a KeyError"
    except KeyError:
        pass
class SkeletonReducer:
    def __init__(self, sparse_skel: SkeletonType):
        self.reduced_to_sparse = OrderedSet(
            sorted(idx for line in sparse_skel.lines_flat for idx in line))
        self.sparse_skel = sparse_skel
        self.dense_skel = SkeletonType(
            map_idxs(sparse_skel.lines, lambda x: self.reduced_to_sparse.index(x)))

    def reduce_arr(self, arr):
        return arr[self.reduced_to_sparse]
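# A minimal sketch (not from the original) of the compaction idea above: an
# OrderedSet of the sorted sparse vertex ids maps each sparse id to a dense id
# via `.index()`, and also serves as a fancy index to slice per-vertex arrays.
import numpy as np
from ordered_set import OrderedSet

lines = [[10, 40], [40, 7]]                                  # sparse vertex ids
lut = OrderedSet(sorted({i for ln in lines for i in ln}))    # OrderedSet([7, 10, 40])
dense_lines = [[lut.index(i) for i in ln] for ln in lines]   # [[1, 2], [2, 0]]
arr = np.arange(50) * 2
print(arr[np.array(lut)])                                    # rows kept: [14 20 80]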
def make_sparse_assoc(freq_path, parallel_text_path, output_path, languages,
                      vocab_size=100000):
    print("Building vocab")
    vocab = OrderedSet()
    languages.sort()
    for language in languages:
        print('\t{}'.format(language))
        language_freq_path = freq_path / '{}.txt'.format(language)
        with language_freq_path.open(encoding='utf-8') as freq_file:
            for i, line in enumerate(freq_file):
                if i >= vocab_size:
                    break
                word, _rest = line.split('\t')
                uri = make_short_uri(language, word)
                vocab.add(uri)

    vocab_path = output_path / 'vocab.txt'
    with vocab_path.open('w', encoding='utf-8') as vocab_out:
        for uri in vocab:
            print(uri, file=vocab_out)

    coords_path = output_path / 'coords.dat'
    with coords_path.open('wb') as coords_out:
        for lang1, lang2 in itertools.combinations(languages, 2):
            print(lang1, lang2)
            parallel_path = parallel_text_path / '{}-{}.txt'.format(lang1, lang2)
            with parallel_path.open(encoding='utf-8') as parallel_file:
                for i, line in enumerate(parallel_file):
                    if i % 100000 == 0:
                        print('\t{}'.format(i))
                    text1, text2 = line.rstrip('\n').split('\t')
                    words1 = [make_short_uri(lang1, word) for word in text1.split()]
                    words2 = [make_short_uri(lang2, word) for word in text2.split()]
                    words = [uri for uri in (words1 + words2) if uri in vocab]
                    for word1 in words:
                        idx1 = vocab.index(word1)
                        for word2 in words:
                            idx2 = vocab.index(word2)
                            coord_bytes = struct.pack('<ii', idx1, idx2)
                            coords_out.write(coord_bytes)
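# A minimal sketch (not from the original) of reading back the coords.dat
# format written above: a flat stream of little-endian int32 pairs, as packed
# with struct.pack('<ii', idx1, idx2). The file path is illustrative.
import struct

with open('coords.dat', 'rb') as coords_in:
    while True:
        chunk = coords_in.read(8)   # two little-endian int32s per record
        if len(chunk) < 8:
            break
        idx1, idx2 = struct.unpack('<ii', chunk)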
def _read_SMILES(self, input_file) -> OrderedSet:
    """
    Reads a SMILES file.

    Returns an ordered set of ReactionContainer objects that passed the
    standardization protocol.

    :param input_file: str
    :return: OrderedSet
    """
    data = OrderedSet()
    self.logger.info('Start..')
    with SMILESRead(input_file, ignore=True, store_log=True,
                    remap=self._ignore_mapping, header=True) as ifile, \
            open(input_file) as meta_searcher:
        id_tag_position = meta_searcher.readline().strip().split().index(self._id_tag)
        if id_tag_position is None or id_tag_position == 0:
            self.logger.critical('No reaction ID tag was found in the header!')
            raise ValueError('No reaction ID tag was found in the header!')
        for reaction in ifile._data:
            if isinstance(reaction, tuple):
                meta_searcher.seek(reaction.position)
                line = meta_searcher.readline().strip().split()
                if len(line) <= id_tag_position:
                    self.logger.critical(
                        f'No reaction ID tag was found in line {reaction.number}!')
                    raise ValueError(
                        f'No reaction ID tag was found in line {reaction.number}!')
                r_id = line[id_tag_position]
                self.logger.critical(
                    f'Reaction {r_id}: Parser has returned an error message\n{reaction.log}')
                continue
            standardized_reaction = self.standardize(reaction)
            if standardized_reaction:
                if standardized_reaction not in data:
                    data.add(standardized_reaction)
                else:
                    i = data.index(standardized_reaction)
                    if 'Extraction_IDs' not in data[i].meta:
                        data[i].meta['Extraction_IDs'] = ''
                    data[i].meta['Extraction_IDs'] = ','.join(
                        data[i].meta['Extraction_IDs'].split(',') +
                        [reaction.meta[self._id_tag]])
                    self.logger.info(
                        'Reaction {0} is a duplicate of the reaction {1}..'.format(
                            reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
    return data
def test_remove():
    set1 = OrderedSet('abracadabra')
    set1.remove('a')
    set1.remove('b')
    assert set1 == OrderedSet('rcd')
    assert set1[0] == 'r'
    assert set1[1] == 'c'
    assert set1[2] == 'd'
    assert set1.index('r') == 0
    assert set1.index('c') == 1
    assert set1.index('d') == 2
    assert 'a' not in set1
    assert 'b' not in set1
    assert 'r' in set1
    # Make sure we can .discard() something that's already gone, plus
    # something that was never there
    set1.discard('a')
    set1.discard('a')
def _read_RDF(self, input_file) -> OrderedSet:
    """
    Reads an RDF file.

    Returns an ordered set of ReactionContainer objects that passed the
    standardization protocol.

    :param input_file: str
    :return: OrderedSet
    """
    data = OrderedSet()
    self.logger.info('Start..')
    with RDFRead(input_file, ignore=self._ignore_mapping, store_log=True,
                 remap=self._ignore_mapping) as ifile, \
            open(input_file) as meta_searcher:
        for reaction in ifile._data:
            if isinstance(reaction, tuple):
                meta_searcher.seek(reaction.position)
                flag = False
                for line in meta_searcher:
                    if flag and '$RFMT' in line:
                        self.logger.critical(
                            f'Reaction id extraction problem arose for the reaction '
                            f'#{reaction.number + 1}: a reaction id was expected but a $RFMT '
                            f'line was found!')
                    if flag:
                        self.logger.critical(
                            f'Reaction {line.strip().split()[1]}: Parser has returned an '
                            f'error message\n{reaction.log}')
                        break
                    elif '$RFMT' in line:
                        self.logger.critical(
                            f'Reaction #{reaction.number + 1} has no reaction id!')
                    elif f'$DTYPE {self._id_tag}' in line:
                        flag = True
                continue
            standardized_reaction = self.standardize(reaction)
            if standardized_reaction:
                if standardized_reaction not in data:
                    data.add(standardized_reaction)
                else:
                    i = data.index(standardized_reaction)
                    if 'Extraction_IDs' not in data[i].meta:
                        data[i].meta['Extraction_IDs'] = ''
                    data[i].meta['Extraction_IDs'] = ','.join(
                        data[i].meta['Extraction_IDs'].split(',') +
                        [reaction.meta[self._id_tag]])
                    self.logger.info(
                        'Reaction {0} is a duplicate of the reaction {1}..'.format(
                            reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
    return data
def standardize_vecs(labels, vecs, merge_mode='weighted'):
    standardized_labels = OrderedSet()
    standardized_vecs = []
    for index, (label, vec) in enumerate(zip(labels, vecs)):
        label = standardize(label)
        if merge_mode == 'weighted':
            vec /= (index + 1)
        if label not in standardized_labels:
            standardized_labels.add(label)
            standardized_vecs.append(vec)
        else:
            if merge_mode != 'first':
                index = standardized_labels.index(label)
                standardized_vecs[index] += vec
    return list(standardized_labels), np.array(standardized_vecs)
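# A minimal sketch (not from the original) of how the merge modes above combine
# rows whose labels standardize to the same form. It assumes `standardize` (a
# helper from the original module, not shown here) lowercases labels; the
# labels and vectors are illustrative.
import numpy as np

labels = ['Cat', 'cat', 'dog']
vecs = np.eye(3)

merged_labels, merged_vecs = standardize_vecs(labels, vecs.copy(), merge_mode='weighted')
print(merged_labels)   # ['cat', 'dog']
print(merged_vecs)     # 'cat' row = 'Cat' row / 1 + 'cat' row / 2; 'dog' row / 3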
def expr_to_matrix(self, expr, row_dict, constr_query, constr_query_symbols):
    """
    First normalizes a given expression with a visitor pattern, then queries
    the knowledge base for the given query and assigns the results to their
    respective row and column indices defined by the row and column
    dictionaries.

    :param expr: The expression to be grounded
    :type expr: Sympy Expression | RLPSum
    :param row_dict: An OrderedSet containing the row indices for the lp matrix
        for the given expression
    :type row_dict: OrderedSet
    :param constr_query: The query originating from a given constraint
    :type constr_query: Sympy Expression | RLPSum
    :param constr_query_symbols: A Set containing the query symbols for the
        given constraint query
    :type constr_query_symbols: FiniteSet
    :return: A dictionary containing a unique name for the variable and the
        results returned from the knowledge base.
    """
    expr = Normalizer(expr).result
    if not isinstance(expr, Add):
        summands = [expr]
    else:
        summands = expr.args

    result = {}
    log.debug("\nSummands: %s", str(summands))
    for summand in summands:
        log.debug("\n->summand: %s", str(summand))
        if isinstance(summand, RlpSum):
            summand_query = summand.query
            summand_query_symbols = summand.query_symbols
            coef_query, coef_expr, variable = coefficient_to_query(summand.args[2])
        else:
            summand_query = True
            summand_query_symbols = EmptySet()
            coef_query, coef_expr, variable = coefficient_to_query(summand)

        query_symbols = OrderedSet(constr_query_symbols + summand_query_symbols)
        query = constr_query & summand_query & coef_query
        answers = self.logkb.ask(query_symbols, query, coef_expr)

        variable_qs_indices = []
        if variable is not None:
            variable_qs_indices = [query_symbols.index(arg)
                                   for arg in variable.args
                                   if isinstance(arg, SubSymbol)]
        constr_qs_indices = [query_symbols.index(symbol)
                             for symbol in constr_query_symbols]

        variable_class = variable.__class__
        col_dict = self.col_dicts.get(variable_class, OrderedSet())
        self.col_dicts[variable_class] = col_dict

        # If the query yields no results we don't have to add anything to the matrix
        if len(answers) == 0:
            continue

        expr_index = len(answers[0]) - 1
        sparse_data = []
        for answer in answers:
            column_record = []
            # use only subsymbols when they occur, otherwise constants
            qs_iterator = iter(variable_qs_indices)
            if variable is not None:
                for arg in variable.args:
                    if isinstance(arg, SubSymbol):
                        column_record.append(answer[next(qs_iterator)])
                    else:
                        column_record.append(arg)
            col_dict_index = col_dict.add(tuple(column_record))
            row_dict_index = row_dict.add(tuple(answer[i] for i in constr_qs_indices))
            sparse_data.append([np.float(answer[expr_index]),
                                row_dict_index, col_dict_index])

        sparse_data = np.array(sparse_data)
        summand_block = sp.sparse.coo_matrix(
            (sparse_data[:, 0], (sparse_data[:, 1], sparse_data[:, 2]))).todok()

        if variable_class in result:
            shape = (len(row_dict), len(col_dict))
            result[variable_class].resize(shape)
            summand_block.resize(shape)
            result[variable_class] += summand_block
        else:
            result[variable_class] = summand_block

    return result
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    assert set1.index(tup) == 0
    assert set1[0] == tup
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    eq_(set1.index(tup), 0)
    eq_(set1[0], tup)
class CategoricalDescriptor(Descriptor):
    """A |Descriptor| used to extract a categorical property from a collection
    of |Record|.

    Args:
        name (str): Name of the |Record| property to describe.
        fetch_fn (Callable): Optional. Defaults to identity. An optional
            function applied to the |Record| property before it is counted in
            a category.

    Attributes:
        name (str): Name of the |Record| property to describe.
    """

    def __init__(self, name, fetch_fn=None):
        super(CategoricalDescriptor, self).__init__(name)
        self._categories = OrderedSet()
        self._fetch_fn = fetch_fn or identity

    def update(self, *record_collections):
        """Update the set of known categories from the |Record| property
        :attr:`name` value.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of
                which |Record| will be used to update the set of known
                categories.
        """
        records = (record
                   for record_collection in record_collections
                   for record in record_collection)
        try:
            for record in records:
                self._categories.add(str(self._fetch_fn(getattr(record, self.name))))
        except AttributeError:
            raise ValueError(
                'Invalid record property name: {} was not found in record.'.format(
                    self.name))

    def compute(self, *record_collections):
        """Construct a new |RecordCollection| where each enclosed |Record| is
        given a category number as a property.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used
                to construct new |RecordCollection| with described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.
        """
        for record_collection in record_collections:
            for record in record_collection:
                try:
                    record.properties[self.property_name] = \
                        self._categories.index(
                            str(self._fetch_fn(getattr(record, self.name)))) + 0.5
                except AttributeError:
                    raise ValueError(
                        'Invalid record property name: {} was not found in record.'.format(
                            self.name))
                except KeyError:
                    raise ValueError(
                        'Invalid record property value: '
                        '{} is out of known property range of values.'.format(
                            getattr(record, self.name)))
        return record_collections

    def reset(self):
        """Reset |CategoricalDescriptor| set of known categories to factory values."""
        self._categories = OrderedSet()

    def _make_interface(self):
        return {
            'type': 'categorical',
            'schema': {
                category: index + 0.5
                for index, category in enumerate(self._categories)
            }
        }
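# A minimal sketch (not from the original) of why `compute` above catches
# KeyError: ordered-set's `index` raises KeyError for an unseen key, unlike
# list.index, which raises ValueError. The category names are illustrative.
from ordered_set import OrderedSet

categories = OrderedSet(['red', 'green'])
print(categories.index('green') + 0.5)   # 1.5 -- the half-offset category code
try:
    categories.index('blue')
except KeyError:
    print('unknown category')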
class BundleAdjuster:
    '''
    Bundle Adjustment class that takes in matches (with initial estimates for
    rotation and focal length of each camera) and minimises the reprojection
    error for all matches' keypoints.
    '''

    # w.r.t. K
    FOCAL_DERIVATIVE = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 0]])
    # w.r.t. K
    PPX_DERIVATIVE = np.array([[0, 0, 1], [0, 0, 0], [0, 0, 0]])
    # w.r.t. K
    PPY_DERIVATIVE = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 0]])

    def __init__(self):
        print(f'BundleAdjuster initialised')
        self._matches = []
        self._match_count = []
        self._cameras = OrderedSet()

    def matches(self):
        return self._matches

    def added_cameras(self):
        return self._cameras

    def add(self, match):
        '''
        Add a match to the bundle adjuster
        '''
        num_pointwise_matches = sum(len(match.inliers) for match in self._matches)
        self._match_count.append(num_pointwise_matches)
        self._matches.append(match)
        for cam in match.cams():
            self._cameras.add(cam)
        print(f'Added match {match}')

    def run(self):
        '''
        Run the bundle adjuster on the current matches to find optimal
        camera parameters
        '''
        if (len(self._matches) < 1):
            raise ValueError(
                'At least one match must be added before bundle adjustment is run')
        print(f'Running bundle adjustment...')
        initial_state = State()
        initial_state.set_initial_cameras(self._cameras)
        initial_residuals = self._projection_errors(initial_state)
        initial_error = math.sqrt(np.mean(initial_residuals**2))
        print(f'Initial error: {initial_error}')
        print('Initial params')
        for param in initial_state.params:
            print(param)

        itr_count = 0
        non_decrease_count = 0
        best_state = initial_state
        best_residuals = initial_residuals
        best_error = initial_error
        while (itr_count < MAX_ITR):
            # print(f'[{itr_count}] Curr state: \n')
            # for (i, el) in enumerate(best_state.params):
            #     print(f'\t[{i}]: {el}')
            J, JtJ = self._calculate_jacobian(best_state)
            param_update = self._get_next_update(J, JtJ, best_residuals)
            next_state = best_state.updatedState(param_update)
            next_residuals = self._projection_errors(next_state)
            next_error_val = math.sqrt(np.mean(next_residuals**2))
            print(f'Next error: {next_error_val}')
            # return
            if (next_error_val >= best_error - 1e-3):
                non_decrease_count += 1
            else:
                print('Updating state to new best state')
                non_decrease_count = 0
                best_error = next_error_val
                # for i in range(len(best_state.params)):
                #     print(f'{best_state.params[i]} -> {next_state.params[i]}')
                best_state = next_state
                best_residuals = next_residuals
            if (non_decrease_count > 5):
                break
            itr_count += 1

        print(f'BEST ERROR {best_error}')
        # Update actual camera object params
        new_cameras = best_state.cameras
        for i in range(len(new_cameras)):
            # print(f'{self._cameras[i].R} = {new_cameras[i].R}')
            print(f'Final focal: {new_cameras[i].focal}')
            self._cameras[i].focal = new_cameras[i].focal
            self._cameras[i].ppx = new_cameras[i].ppx
            self._cameras[i].ppy = new_cameras[i].ppy
            self._cameras[i].R = new_cameras[i].R

    def _cross_product_matrix(self, x, y, z):
        return np.array([[0, -z, y], [z, 0, -x], [-y, x, 0]], dtype=np.float64)

    def _dR_dvi(self, rotation_matrix, x, y, z):
        '''
        The derivative of the rotation with respect to each rotation parameter
        Returns 3 matrices (dR/dx, dR/dy, dR/dz)
        Calculated using https://arxiv.org/pdf/1312.0788.pdf
        '''
        ssq_params = x * x + y * y + z * z
        if (ssq_params < 1e-14):
            return np.array([
                self._cross_product_matrix(1, 0, 0),
                self._cross_product_matrix(0, 1, 0),
                self._cross_product_matrix(0, 0, 1)
            ])
        cross_product_matrix = self._cross_product_matrix(x, y, z)
        ret = [cross_product_matrix, cross_product_matrix, cross_product_matrix]
        ret[0] = ret[0] * x
        ret[1] = ret[1] * y
        ret[2] = ret[2] * z
        I_minus_R = np.identity(3) - rotation_matrix
        for i in range(3):
            x1, y1, z1 = np.cross(np.array([x, y, z]), I_minus_R[:, i])
            ret[i] += self._cross_product_matrix(x1, y1, z1)
            ret[i] = np.multiply(ret[i], 1 / ssq_params)
            ret[i] = ret[i] @ rotation_matrix
        return ret

    def _drdv(self, dhdv, homo, hz_inv, hz_sqr_inv):
        return np.array([
            -dhdv[0] * hz_inv + dhdv[2] * homo[0] * hz_sqr_inv,
            -dhdv[1] * hz_inv + dhdv[2] * homo[1] * hz_sqr_inv
        ], dtype=np.float64)

    def _homogeneous_coordinate_2d(self, coordinate):
        '''
        Convert Cartesian coordinate to homogeneous coordinate
        '''
        return np.append(coordinate, [1])

    def _trans(self, transform, coordinate):
        if (len(coordinate) == 2):
            return self._trans(transform, self._homogeneous_coordinate_2d(coordinate))
        elif (len(coordinate) == 3):
            return transform @ coordinate

    def _calculate_jacobian(self, state):
        with open('ba_test_data.txt', 'w') as f:
            params = state.params
            cameras = state.cameras
            f.write('Params:\n')
            for (i, param) in enumerate(params):
                f.write(f'[{i}] {param}\n')
            f.write('\nCameras:\n')
            for (i, camera) in enumerate(cameras):
                f.write(f'[{i}] Focal: {cameras[i].focal}, R: {cameras[i].R}\n')

            num_cams = len(cameras)
            num_pointwise_matches = sum(len(match.inliers) for match in self._matches)
            J = np.zeros((PARAMS_PER_POINT_MATCH * num_pointwise_matches,
                          PARAMS_PER_CAMERA * num_cams), dtype=np.float64)
            JtJ = np.zeros((PARAMS_PER_CAMERA * num_cams,
                            PARAMS_PER_CAMERA * num_cams), dtype=np.float64)

            all_dRdvi = []
            for i in range(len(cameras)):
                param_i = i * PARAMS_PER_CAMERA
                x, y, z = params[param_i + 3:param_i + 6]
                dRdvi = self._dR_dvi(cameras[i].R, x, y, z)
                all_dRdvi.append(dRdvi)

            for (i, match) in enumerate(self._matches):
                # print(f'------------\n')
                # print(f'Loop itr: {i}')
                match_count_idx = self._match_count[i] * 2
                cam_to_idx = self._cameras.index(match.cam_to)
                cam_from_idx = self._cameras.index(match.cam_from)
                cam_to = cameras[cam_to_idx]
                cam_from = cameras[cam_from_idx]
                # print(f'from.R: {cam_from.R}')
                # print(f'to.R: {cam_to.R}')
                params_index_from = cam_from_idx * PARAMS_PER_CAMERA
                params_index_to = cam_to_idx * PARAMS_PER_CAMERA
                # print(f'params_index_from: {params_index_from}')
                # print(f'params_index_to: {params_index_to}')
                from_K = cam_from.K
                to_K_inv = np.linalg.pinv(cam_to.K)
                to_R_inv = cam_to.R.T
                from_R = cam_from.R
                d_R_from_vi = all_dRdvi[cam_from_idx]
                d_R_to_vi = np.copy(all_dRdvi[cam_to_idx])
                d_R_to_vi_T = [m.T for m in d_R_to_vi]
                H_to_to_from = (from_K @ from_R) @ (to_R_inv @ to_K_inv)
                # print(f'H_to_to_from: {H_to_to_from}')

                for (pair_index, pair) in enumerate(match.inliers):
                    to_coordinate = pair[1]
                    homo = self._trans(H_to_to_from, to_coordinate)
                    hz_sqr_inv = 1 / (homo[2]**2)
                    hz_inv = 1 / homo[2]
                    d_from = np.zeros((PARAMS_PER_CAMERA, PARAMS_PER_POINT_MATCH))
                    d_to = np.zeros((PARAMS_PER_CAMERA, PARAMS_PER_POINT_MATCH))

                    m = from_R @ to_R_inv @ to_K_inv
                    dot_u2 = self._trans(m, to_coordinate)  # m @ self._homogeneous_coordinate_2d(to_coordinate)
                    d_from[0] = self._drdv(self._trans(self.FOCAL_DERIVATIVE, dot_u2),
                                           homo, hz_inv, hz_sqr_inv)
                    d_from[1] = self._drdv(self._trans(self.PPX_DERIVATIVE, dot_u2),
                                           homo, hz_inv, hz_sqr_inv)
                    d_from[2] = self._drdv(self._trans(self.PPY_DERIVATIVE, dot_u2),
                                           homo, hz_inv, hz_sqr_inv)

                    dot_u2 = self._trans((to_R_inv @ to_K_inv), to_coordinate)
                    f.write(f'dot_u2: {dot_u2}\n')
                    f.write(f'from_K: {from_K}\n')
                    f.write(f'd_R_from_vi[0]: {d_R_from_vi[0]}\n')
                    f.write(f'homo: {homo}\n')
                    f.write(f'hz_inv: {hz_inv}\n')
                    f.write(f'hz_sqr_inv: {hz_sqr_inv}\n')
                    d_from[3] = self._drdv(self._trans((from_K @ d_R_from_vi[0]), dot_u2),
                                           homo, hz_inv, hz_sqr_inv)
                    d_from[4] = self._drdv(self._trans((from_K @ d_R_from_vi[1]), dot_u2),
                                           homo, hz_inv, hz_sqr_inv)
                    d_from[5] = self._drdv(self._trans((from_K @ d_R_from_vi[2]), dot_u2),
                                           homo, hz_inv, hz_sqr_inv)

                    m = from_K @ from_R @ to_R_inv @ to_K_inv
                    dot_u2 = self._trans(to_K_inv, to_coordinate) * -1
                    # print(f'dot_u2: {dot_u2}')
                    d_to[0] = self._drdv(self._trans((m @ self.FOCAL_DERIVATIVE), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    d_to[1] = self._drdv(self._trans((m @ self.PPX_DERIVATIVE), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    d_to[2] = self._drdv(self._trans((m @ self.PPY_DERIVATIVE), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    # d_to[1], d_to[2] = d_to[2], d_to[1]

                    m = from_K @ from_R
                    dot_u2 = self._trans(to_K_inv, to_coordinate)
                    d_to[3] = self._drdv(self._trans((m @ d_R_to_vi_T[0]), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    d_to[4] = self._drdv(self._trans((m @ d_R_to_vi_T[1]), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    d_to[5] = self._drdv(self._trans((m @ d_R_to_vi_T[2]), dot_u2),
                                         homo, hz_inv, hz_sqr_inv)
                    # print(f'dfrom: {d_from}')
                    # print(f'dto: {d_to}')
                    f.write(f'dfrom: {d_from}\n')
                    f.write(f'dto: {d_to}\n')
                    # if (pair_index == 0):
                    #     print(f'dfrom: {d_from}')
                    #     print(f'dto: {d_to}')

                    for param_idx in range(PARAMS_PER_CAMERA):
                        # IS pair_index CORRECT HERE?
                        J[match_count_idx, params_index_from + param_idx] = d_from[param_idx][0]
                        J[match_count_idx, params_index_to + param_idx] = d_to[param_idx][0]
                        J[match_count_idx + 1, params_index_from + param_idx] = d_from[param_idx][1]
                        J[match_count_idx + 1, params_index_to + param_idx] = d_to[param_idx][1]
                        f.write(f'({match_count_idx}, {params_index_from + param_idx}) dfrom[{param_idx}].x: {d_from[param_idx][0]}\n')
                        f.write(f'({match_count_idx}, {params_index_to + param_idx}) dto[{param_idx}].x: {d_to[param_idx][0]}\n')
                        f.write(f'({match_count_idx+1}, {params_index_from + param_idx}) dfrom[{param_idx}].y: {d_from[param_idx][1]}\n')
                        f.write(f'({match_count_idx+1}, {params_index_to + param_idx}) dto[{param_idx}].y: {d_to[param_idx][1]}\n')

                    for param_idx_i in range(PARAMS_PER_CAMERA):
                        for param_idx_j in range(PARAMS_PER_CAMERA):
                            # f.write(f'[l1] index_from: {params_index_from}, index_to: {params_index_to}, i: {param_idx_i}, j: {param_idx_j}\n')
                            i1 = params_index_from + param_idx_i
                            i2 = params_index_to + param_idx_j
                            val = d_from[param_idx_i] @ d_to[param_idx_j]
                            JtJ[i1][i2] += val
                            JtJ[i2][i1] += val
                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            f.write(f'JtJ[{i2}][{i1}] += {val}\n')

                    for param_idx_i in range(PARAMS_PER_CAMERA):
                        for param_idx_j in range(param_idx_i, PARAMS_PER_CAMERA):
                            # f.write(f'[l2] index_from: {params_index_from}, index_to: {params_index_to}, i: {param_idx_i}, j: {param_idx_j}\n')
                            i1 = params_index_from + param_idx_i
                            i2 = params_index_from + param_idx_j
                            val = d_from[param_idx_i] @ d_from[param_idx_j]
                            JtJ[i1][i2] += val
                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            if (param_idx_i != param_idx_j):
                                JtJ[i2][i1] += val
                                f.write(f'JtJ[{i2}][{i1}] += {val}\n')
                            i1 = params_index_to + param_idx_i
                            i2 = params_index_to + param_idx_j
                            val = d_to[param_idx_i] @ d_to[param_idx_j]
                            JtJ[i1][i2] += val
                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            if (param_idx_i != param_idx_j):
                                JtJ[i2][i1] += val
                                f.write(f'JtJ[{i2}][{i1}] += {val}\n')

                    match_count_idx += 2
        return J, JtJ

    def _transform_2d(self, H, coordinate):
        '''
        Converts cartesian coordinate to homogeneous
        Project coordinate with H
        Convert back to cartesian
        '''
        homogeneous_coordinate = self._homogeneous_coordinate_2d(coordinate)
        p = H @ homogeneous_coordinate
        return np.array([p[0] / p[2], p[1] / p[2]])

    def _projection_errors(self, state):
        current_cameras = state.cameras
        num_pointwise_matches = sum(len(match.inliers) for match in self._matches)
        error = np.zeros((num_pointwise_matches * PARAMS_PER_POINT_MATCH))
        count = 0
        for match in self._matches:
            cam_from = current_cameras[self._cameras.index(match.cam_from)]
            cam_to = current_cameras[self._cameras.index(match.cam_to)]
            from_K = cam_from.K
            from_R = cam_from.R
            to_K_inv = np.linalg.pinv(cam_to.K)
            to_R_inv = cam_to.R.T
            H_to_to_from = (from_K @ from_R) @ (to_R_inv @ to_K_inv)
            start = count
            for pair in match.inliers:
                from_coordinate = pair[0]
                to_coordinate = pair[1]
                transformed = self._transform_2d(H_to_to_from, to_coordinate)
                error[count] = from_coordinate[0] - transformed[0]
                error[count + 1] = from_coordinate[1] - transformed[1]
                count += 2
            print(f'Match from_{match.cam_from.image.filename} to_{match.cam_to.image.filename} error: {math.sqrt(np.mean(error[start:]**2))}')
        # print(f'projection_error ({len(error)}):\n{error}')
        return error

    def _get_next_update(self, J, JtJ, residuals):
        # Regularisation
        l = random.normalvariate(1, 0.1)
        # print(f'random.normalvariate(10, 20): {random.normalvariate(10, 20)}')
        for i in range(len(self._cameras) * PARAMS_PER_CAMERA):
            if (i % PARAMS_PER_CAMERA >= 3):
                # TODO: Improve regularisation params (currently a bit off)
                JtJ[i][i] += (3.14 / 16) * l  # random.normalvariate(10, 20) * 5000000000
            else:
                # TODO: Use initial focal estimate
                JtJ[i][i] += (1500 / 10) * l  # random.normalvariate(10, 20) * 5000000000
        # print(f'J.T shape: {J.T.shape}')
        # print(f'residuals: {residuals}')
        # with open('test_error_residuals.txt', 'w') as f:
        #     for r in residuals:
        #         f.write(f'{r}\n')
        # openpano_JtJ = np.zeros((24, 24), dtype=np.float64)
        # filename = 'ba_optimize.txt'
        # readingB = False
        # openpano_b = []
        # with open('./match_test_data/' + filename, 'r') as fp:
        #     for line in fp:
        #         if re.match(r'^\(', line):
        #             tings = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #             # print(float(tings[2]))
        #             openpano_JtJ[int(tings[0])][int(tings[1])] = float(tings[2])
        #         elif re.match(r'b:', line):
        #             readingB = True
        #         elif (readingB and not re.match(r'^\s$', line)):
        #             bVal = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #             # print(float(bVal[0]))
        #             openpano_b.append(float(bVal[0]))
        #         elif (readingB and re.match(r'^\s$', line)):
        #             readingB = False
        # openpano_b = np.asarray(openpano_b, dtype=np.float64)
        # with open('JtJ_test.txt', 'w') as f:
        #     print(f'JtJ.shape : {JtJ.shape}')
        #     for i in range(JtJ.shape[0]):
        #         for j in range(JtJ.shape[1]):
        #             f.write(f'({i}, {j}) {JtJ[i][j]}\n')

        b = J.T @ residuals

        # with open('JtJ_test_comparison.txt', 'w') as f:
        #     print(f'JtJ.shape : {JtJ.shape}')
        #     for i in range(JtJ.shape[0]):
        #         for j in range(JtJ.shape[1]):
        #             percentDiff = ((openpano_JtJ[i][j] - JtJ[i][j]) / openpano_JtJ[i][j]) * 100
        #             if (abs(percentDiff) > 0.001):
        #                 f.write(f'({i}, {j}) JtJ={JtJ[i][j]}, OpenPano_JtJ={openpano_JtJ[i][j]} [Diff={percentDiff}]\n')
        #     f.write('\nb:\n')
        #     for (i, el) in enumerate(b):
        #         percentDiff = ((openpano_b[i] - b[i]) / openpano_b[i]) * 100
        #         if (abs(percentDiff) > 0.001):
        #             f.write(f'({i}) b={b[i]}, openpano_b={openpano_b[i]} [Diff={percentDiff}]\n')
        # JtJ = openpano_JtJ
        # b = openpano_b

        updates = np.linalg.solve(JtJ, b)
        # print('b:')
        # for (i, el) in enumerate(b):
        #     print(f'\t[{i}]: {el}')
        # print('Updates:')
        # for (i, update) in enumerate(updates):
        #     print(f'\t[{i}]: {update}')
        # print('Recomputed b vector:')
        # for (i, newB) in enumerate(JtJ @ updates):
        #     print(f'\tnewB: {newB}')
        # updates = []
        # filename = 'ba_optimize.txt'
        # readingB = False
        # with open('./match_test_data/' + filename, 'r') as fp:
        #     for line in fp:
        #         if re.match(r'Update:', line):
        #             readingB = True
        #         elif (readingB and re.match(r'^\t+', line)):
        #             bVal = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #             # print(float(bVal[0]))
        #             updates.append(float(bVal[0]))
        #         elif (readingB and re.match(r'^\s$', line)):
        #             readingB = False
        # print('Updates')
        # for update in updates:
        #     print(update)
        # updates = np.array(updates, dtype=np.float64)
        return updates
def _compute_validation_outputs(self, world: List[SpiderWorld], sub_graphs,
                                sub_graphs_scores, sub_graphs_labels,
                                candidates, outputs: Dict[str, Any]) -> None:
    batch_size = len(world)
    outputs['predicted_sql_query'] = []
    outputs['candidates'] = []
    for i in range(batch_size):
        if world[i].query is not None:
            gold_sql_query = ' '.join(world[i].query)
            difficulty = self._query_difficulty(
                query_tokens=gold_sql_query.split(),
                entities=set(world[i].db_context.knowledge_graph.entities))

        num_candidates = self._metric_num_candidates
        example_sub_graphs = sub_graphs[i, :num_candidates]
        example_sub_graphs_scores = sub_graphs_scores[i, :num_candidates]
        example_candidates = candidates[i][:num_candidates]
        if sub_graphs_labels is not None:
            example_sub_graphs_labels = sub_graphs_labels[i, :num_candidates]

        candidate_to_sub_graph_id = {}
        sub_graphs_ids = []
        for sub_graph in example_sub_graphs:
            entities_ids = sub_graph[sub_graph > -1].tolist()
            if len(entities_ids) == 0:
                continue
            sub_graph = tuple(sorted(entities_ids))
            if sub_graph not in candidate_to_sub_graph_id:
                candidate_to_sub_graph_id[sub_graph] = len(candidate_to_sub_graph_id)
            sub_graphs_ids.append(candidate_to_sub_graph_id[sub_graph])

        sorted_candidates_ids = example_sub_graphs_scores.sort(descending=True)[1].tolist()
        sorted_sub_graphs = OrderedSet([
            sub_graphs_ids[j] for j in sorted_candidates_ids
            if j < len(sub_graphs_ids)
        ])

        candidates_for_final_sort = []
        for original_rank, c in enumerate(example_candidates):
            sub_graph_id = sub_graphs_ids[original_rank]
            if sub_graphs_labels is not None:
                sg_correct = int(example_sub_graphs_labels[original_rank] == 1)
            else:
                sg_correct = None
            candidates_for_final_sort.append({
                'query': c['query'],
                'original_rank': original_rank,
                'reranker_sg_rank': sorted_sub_graphs.index(sub_graph_id),
                'reranker_cand_rank': sorted_candidates_ids.index(original_rank),
                'sub_graph_correct': sg_correct,
                'correct': c['correct']
            })

        # sorting sub graphs, then inner-ranking by original beam search order
        candidates_sg_sort = sorted(
            candidates_for_final_sort,
            key=lambda x: (x['reranker_sg_rank'], x['original_rank']))

        if sub_graphs_labels is not None:
            sg_tsk_query_correct = candidates_sg_sort[0]['correct']
            self._update_metric('query_accuracy', int(sg_tsk_query_correct), difficulty)

        outputs['candidates'].append([c['query'] for c in candidates_sg_sort])
for split in splits:
    instances_per_ig[split] = {}
    triples_per_ig[split] = {}
    start_instance_id = 0
    for ig_index, ig in enumerate(data[split]):
        instances = OrderedSet()
        triples = defaultdict(int)
        existing_combinations = set()
        for t in ig:
            object_indices = []
            for pos in [S_n, O_n]:
                object_ = [t[pos][category]]
                object_.extend([t[pos][bbox][i] for i in range(3)])
                object_ = tuple(object_)
                instances.add(object_)
                object_indices.append(instances.index(object_))
            existing_combinations.add(tuple(object_indices))
            triples[(t[S_n][category], t[P_n], t[O_n][category],
                     object_indices[0] + start_instance_id,
                     object_indices[1] + start_instance_id)] += 1

        # add unannotated subject-object-pairs as triples with unknown relation
        for sbj_index in range(len(instances)):
            for obj_index in range(len(instances)):
                if sbj_index != obj_index:
                    if (sbj_index, obj_index) not in existing_combinations:
                        sbj = instances[sbj_index]
                        obj = instances[obj_index]
                        triples[(sbj[0], relations["unknown"], obj[0],
                                 sbj_index + start_instance_id,
                                 obj_index + start_instance_id)] += 1
class WordVectors:
    def __init__(self, labels, vectors, replacements=None, standardizer=standardize):
        assert len(labels) == len(vectors)
        self.labels = OrderedSet(labels)
        if not isinstance(vectors, np.memmap):
            normalize(vectors, copy=False)
        self.vectors = vectors
        self.replacements = replacements
        self._standardizer = standardizer
        self._mean_vec = np.mean(self.vectors, axis=0)

    def truncate(self, size):
        return WordVectors(
            list(self.labels)[:size], self.vectors[:size],
            self.replacements, self._standardizer
        )

    def similarity(self, word1, word2, lang=None):
        try:
            return self.to_vector(word1, lang).dot(self.to_vector(word2, lang))
        except KeyError:
            return 0

    def to_vector(self, word, lang=None, default_zero=False) -> np.ndarray:
        if isinstance(word, list):
            vec = np.zeros(self.vectors.shape[1])
            for actual_word, weight in word:
                vec += self.to_vector(actual_word, lang=lang)
            return normalize_vec(vec)
        if self._standardizer is not None:
            if self._standardizer is standardize and lang is not None:
                word = self._standardizer(word, lang=lang)
            else:
                word = self._standardizer(word)
        max_sim = 1.
        if self.replacements and word in self.replacements:
            while word not in self.labels:
                word, sim = self.replacements[word]
                # max_sim *= np.sqrt(sim)
        if default_zero and word not in self.labels:
            return np.zeros(self.vectors.shape[1])
        vec = normalize_vec(self.vectors[self.labels.index(word)])
        return vec * max_sim

    def similar_to(self, word_or_vector, num=20, only=None):
        if isinstance(self.vectors, np.memmap):
            self.vectors = normalize(self.vectors)
        if isinstance(word_or_vector, str):
            vec = self.to_vector(word_or_vector)
        else:
            vec = word_or_vector
        sim = self.vectors.dot(vec)
        indices = np.argsort(sim)[::-1]
        out = []
        for index in indices:
            if len(out) == num:
                return out
            if only is None or only(self.labels[index]):
                out.append((self.labels[index], sim[index]))
        return out

    def which_relation(self, rel_array, v1, v2):
        if isinstance(v1, str):
            v1 = self.to_vector(v1)
        if isinstance(v2, str):
            v2 = self.to_vector(v2)
        avg_rel = self._mean_vec.dot(rel_array.dot(self._mean_vec))
        rels = v2.dot(rel_array.dot(v1))
        diff = np.maximum(0, rels - avg_rel) ** 2
        return diff / np.sum(diff)

    def analogy_values(self, rel_array, c1, c2, c3, vector_choices):
        # Convert the input concepts to vectors
        v1, v2, v3 = [self.to_vector(c, default_zero=True) for c in (c1, c2, c3)]

        # relA and relB are vectors whose length is the number of relations.
        # They indicate the relative weight with which each relation holds
        # between appropriate pairs of input concepts.
        relA = self.which_relation(rel_array, v1, v2)
        relB = self.which_relation(rel_array, v1, v3)

        # relAr and relBr are matrices that use these combinations of
        # relations to convert one vector into another.
        relAr = rank3_inner_product(relA, rel_array)
        relBr = rank3_inner_product(relB, rel_array)

        # rv1 is the vector that's related to v1 by these relations, and so on.
        rv1 = (relAr + relBr).dot(v1)
        rv2 = relBr.dot(v2)
        rv3 = relAr.dot(v3)

        ratings = weighted_3cosmul(rv1, rv2, rv3, vector_choices)
        return ratings

    def rank_analogies(self, rel_array, c1, c2, c3, only=None, num=20):
        ratings = self.analogy_values(rel_array, c1, c2, c3, self.vectors)
        indices = np.argsort(ratings)[::-1]
        out = []
        for index in indices:
            if len(out) >= num:
                return out
            if only is None or only(self.labels[index]):
                out.append((self.labels[index], ratings[index]))
        return out

    def rate_analogy(self, rel_array, c1, c2, c3, c4):
        v4 = self.to_vector(c4)
        return self.analogy_values(rel_array, c1, c2, c3, v4)
class Graph:
    """Object to represent a directed graph."""

    def __init__(
        self,
        nodes: Optional[Sequence[Node]] = None,
        edges: Optional[Sequence[Edge]] = None,
        A: Optional[spmatrix] = None,
        nodeprops: Optional[NodeProperties] = None,
        edgeprops: Optional[EdgeProperties] = None,
    ):
        self.A_ = dok_matrix((MAX_N_NODES, MAX_N_NODES), dtype=bool)
        if nodes is None:
            self.nodes_ = OrderedSet()
            if edges is not None:
                self.edges_ = set(edges)
                for x, y in edges:
                    i = self.nodes_.add(x)
                    j = self.nodes_.add(y)
                    self.A_[i, j] = True
            elif A is not None:
                self.nodes_ = OrderedSet(np.arange(A.shape[0]))
                self.edges_ = set()
                for i, j in zip(*A.nonzero()):
                    self.edges_.add((i, j))
                    self.A_[i, j] = True
            else:
                self.edges_ = set()
        else:
            self.nodes_ = OrderedSet(nodes)
            if edges is not None:
                self.edges_ = set(edges)
                for x, y in edges:
                    i = self.nodes_.index(x)
                    j = self.nodes_.index(y)
                    self.A_[i, j] = True
            elif A is not None:
                self.edges_ = set()
                for i, j in zip(*A.nonzero()):
                    self.edges_.add((self.nodes_[i], self.nodes_[j]))
                    self.A_[i, j] = True
            else:
                self.edges_ = set()
        self.nodeprops = ifnone(nodeprops, {})
        self.edgeprops = ifnone(edgeprops, {})

    @property
    def n_nodes(self):
        return len(self.nodes_)

    @property
    def nodes(self):
        return self.nodes_

    @property
    def edges(self):
        return self.edges_

    @property
    def A(self):
        return self.A_.tocsr()[:self.n_nodes, :self.n_nodes]

    def add_node(self, node: Node):
        self.nodes_.add(node)

    def add_nodes(self, nodes: Sequence[Node]):
        for node in nodes:
            self.add_node(node)

    def add_edge(self, edge: Edge):
        self.add_nodes(edge)
        self.edges_.add(edge)
        n1, n2 = edge
        i = self.nodes_.index(n1)
        j = self.nodes_.index(n2)
        self.A_[i, j] = True

    def add_edges(self, edges: Sequence[Edge]):
        for edge in edges:
            self.add_edge(edge)

    def remove_edge(self, edge: Edge):
        try:
            self.edges_.remove(edge)
        except KeyError:
            print(f"Edge {edge} was not found in graph")
        n1, n2 = edge
        i = self.nodes_.index(n1)
        j = self.nodes_.index(n2)
        self.A_[i, j] = False

    def reset(self):
        self.nodes_ = OrderedSet()
        self.edges_ = set()
        self.A_ = dok_matrix((MAX_N_NODES, MAX_N_NODES), dtype=bool)
        self.nodeprops = {}
        self.edgeprops = {}
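# A minimal usage sketch (not from the original). It assumes the module-level
# names the class relies on (MAX_N_NODES, ifnone, and the Node/Edge type
# aliases) are defined, e.g. MAX_N_NODES = 100.
g = Graph(edges=[('a', 'b'), ('b', 'c')])
print(list(g.nodes))    # ['a', 'b', 'c'] -- insertion order is preserved
print(g.A.toarray())    # 3x3 boolean adjacency matrix
g.add_edge(('c', 'a'))
g.remove_edge(('a', 'b'))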
class Featurizer(object):
    #
    # Converts chorales into a matrix of feature indices. Each vector in a
    # matrix represents a specific beat within a chorale. Note that indices
    # are 1-based to comply with Torch.
    #

    # Initialize with the number of scores to analyze
    def __init__(self, num_scores=20):
        self.num_scores = num_scores
        self.indices = {}
        self.features = []
        self.harmonies = []
        self.max_index = 0
        self.original = []           # original, cleaned scores deposited here
        self.training_split = []     # training scores
        self.test_split = []         # test scores
        self.percentage_train = 0.8  # percentage of scores to be in the training split
        self.percentage_dev = 0.5    # percentage of the test set to be used as a dev set
        self.data_dir = "raw_data/"
        self.output_dir = "data/"

        # Training examples created by featurize()
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []

    # Collect all scores and preprocess them
    @timing
    def gather_scores(self):
        from os import listdir
        self.original = []
        for f in glob(self.data_dir + "*.xml"):
            self.original.append(converter.parse(f))
        print "Gathered %d 4-part chorales." % len(self.original)
        return self.original

    # Analyze the chorales and determine the possible values for each feature
    @timing
    def analyze(self):
        self.analyzed = []  # to save time, we store the related objects to a score for featurizing

        # Reset feature sets
        self.keys = OrderedSet()
        self.key_modes = OrderedSet()
        self.times = OrderedSet()
        self.beats = OrderedSet()
        self.offset_ends = OrderedSet()
        self.cadence_dists = OrderedSet()
        self.intervals = OrderedSet()
        self.cadences = OrderedSet(['cadence', 'no cadence'])
        self.pitch = OrderedSet(range(RANGE['Soprano']['min'], RANGE['Soprano']['max'] + 1))
        self.numerals = OrderedSet()    # output feature
        self.inversions = OrderedSet()  # output feature

        # THIS ORDER MATTERS
        self.features = [('key', self.keys), ('mode', self.key_modes), ('time', self.times), \
                         ('beatstr', self.beats), ('offset', self.offset_ends), ('cadence_dists', self.cadence_dists), \
                         ('cadence?', self.cadences), ('pitch', self.pitch), ('ibefore', self.intervals), \
                         ('iafter', self.intervals), ('numeral_prev', self.numerals), ('inv_prev', self.inversions)]

        for idx, score in enumerate(self.original):
            sys.stdout.write("Analyzing #%d \r" % (idx + 1))
            sys.stdout.flush()

            # score-wide features
            S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), \
                         getNotes(score.parts[2]), getNotes(score.parts[3])
            assert len(S) == len(A)
            assert len(A) == len(T)
            assert len(T) == len(B)
            time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
            key_obj = getKeyFromSignature(key_sig)
            fermata_locations = map(hasFermata, S)

            # Score-wide: Key (sharps, mode) and Time (num, denom)
            self.keys.add(feat_key(key_sig))
            self.key_modes.add(key_sig.mode)
            self.times.add((time_sig.numerator, time_sig.denominator))

            # Note-specific data
            for index, n in enumerate(S):
                # Beat strength
                self.beats.add(feat_beat(n))
                # Offset from the end
                self.offset_ends.add(feat_offset_end(index, len(S)))
                # Distance to next cadence
                self.cadence_dists.add(feat_cadence_dist(n, index, fermata_locations))
                # Intervals
                if index > 0:
                    self.intervals.add(feat_interval(S[index - 1], S[index]))
                # Harmony
                numeral, inversion = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
                self.numerals.add(numeral)
                self.inversions.add(inversion)

            # Store objects for featurizing
            self.analyzed.append((score, S, A, T, B, time_sig, key_sig, key_obj, fermata_locations))

        # Add 'None' as an option for previous harmonies
        # (i.e. to say there's no previous harmony for the first beat)
        self.numerals.add('None')
        self.inversions.add('None')

        # Add 'None' as an option for previous and future melodic intervals
        # (i.e. the first note has no previous note, so the 'interval before'
        # is represented as 'None')
        self.intervals.add('None')

        # Set feature indices
        i_max = 1
        for name, values in self.features:
            self.indices[name] = (i_max, i_max + len(values) - 1)
            i_max += len(values)
        self.max_index = i_max  # record the highest index

    # Wrapper function for featurize_score():
    @timing
    def featurize(self):
        # Create train-test split
        training, remaining = self.split(self.analyzed, self.percentage_train)
        dev, test = self.split(remaining, self.percentage_dev)
        self.X_train, self.y_train, self.X_dev, self.y_dev, self.X_test, self.y_test = [], [], [], [], [], []

        # Training set
        for idx, score in enumerate(training):
            sys.stdout.write("Featurizing #%d \r" % (idx + 1))
            sys.stdout.flush()
            X, y = self.featurize_score(score)
            self.X_train.append(X)
            self.y_train.append(y)
        print "Featurized training set."

        # Development set
        for idx, score in enumerate(dev):
            sys.stdout.write("Featurizing #%d \r" % (idx + 1))
            sys.stdout.flush()
            X, y = self.featurize_score(score)
            self.X_dev.append(X)
            self.y_dev.append(y)
        print "Featurized development set."

        # Test set
        for idx, score in enumerate(test):
            sys.stdout.write("Featurizing #%d \r" % (idx + 1))
            sys.stdout.flush()
            X, y = self.featurize_score(score)
            self.X_test.append(X)
            self.y_test.append(y)
        print "Featurized test set."

        print "Training examples size: %d" % len(self.X_train)
        print "Test examples size: %d" % len(self.X_test)

        # Freeze for future use
        freezeObject(self.X_train, "X_train")
        freezeObject(self.y_train, "y_train")
        freezeObject(self.X_dev, "X_dev")
        freezeObject(self.y_dev, "y_dev")
        freezeObject(self.X_test, "X_test")
        freezeObject(self.y_test, "y_test")
        freezeObject(list(self.numerals), "numerals")
        freezeObject(list(self.inversions), "inversions")
        freezeObject(self.indices, "indices")

    # After analysis, this generates the training examples (input vectors, output vectors)
    # As scores are examined, the indices of output chords are generated.
    def featurize_score(self, score_packet):
        # feature vectors
        X, y = [], []

        # unpack score objects
        score, S, A, T, B, time_sig, key_sig, key_obj, fermata_locations = score_packet

        # Create X vector and y output
        for index, n in enumerate(S):
            # Key
            f_key = self.keys.index(feat_key(key_sig)) + self.indices['key'][0]
            # Key mode
            f_mode = self.key_modes.index(key_sig.mode) + self.indices['mode'][0]
            # Time
            f_time = self.times.index((time_sig.numerator, time_sig.denominator)) + self.indices['time'][0]
            # Beat
            f_beat = self.beats.index(feat_beat(n)) + self.indices['beatstr'][0]
            # Offset end
            f_off_end = self.offset_ends.index(feat_offset_end(index, len(S))) + self.indices['offset'][0]
            # Cadence distance
            f_cadence_dist = self.cadence_dists.index(feat_cadence_dist(n, index, fermata_locations)) + self.indices['cadence_dists'][0]
            # Has cadence?
            f_cadence = feat_cadence(n) + self.indices['cadence?'][0]
            # Pitch
            f_pitch = self.pitch.index(feat_pitch(n)) + self.indices['pitch'][0]
            # Melodic interval before
            ibefore = feat_interval(S[index - 1], S[index]) if index > 0 else 'None'
            f_ibefore = self.intervals.index(ibefore) + self.indices['ibefore'][0]
            # Melodic interval after
            iafter = feat_interval(S[index], S[index + 1]) if index < len(S) - 1 else 'None'
            f_iafter = self.intervals.index(iafter) + self.indices['iafter'][0]
            # Previous harmony
            num_prev, inv_prev = feat_harmony(S[index - 1], A[index - 1], T[index - 1], B[index - 1], key_obj) \
                if index > 0 else ('None', 'None')
            f_num_prev = self.numerals.index(num_prev) + self.indices['numeral_prev'][0]
            f_inv_prev = self.inversions.index(inv_prev) + self.indices['inv_prev'][0]

            # Input vector
            input_vec = [f_key, f_mode, f_time, f_beat, f_off_end, f_cadence_dist, f_cadence, f_pitch, \
                         f_ibefore, f_iafter, f_num_prev, f_inv_prev]

            # Output class, 1-indexed for Torch
            f_num, f_prev = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
            output_vec = [self.numerals.index(f_num) + 1, self.inversions.index(f_prev) + 1]

            X.append(input_vec)
            y.append(output_vec)
        return X, y

    # Verify that the feature indices are all in the right ranges
    def verify(self):
        print "Verifying indices..."
        # self.X_train, self.y_train = thawObject("X_train"), thawObject("y_train")
        # self.X_test, self.y_test = thawObject("X_test"), thawObject("y_test")
        # self.indices = thawObject('indices')
        # self.numerals = thawObject('numerals')
        # self.inversions = thawObject('inversions')
        inputs = self.X_train + self.X_test
        outputs = self.y_train + self.y_test
        for i, score in enumerate(inputs):
            s_in = score
            s_out = outputs[i]
            for j, example in enumerate(s_in):
                numeral, inversion = s_out[j]
                # Note the order here corresponds with the order in which the
                # example features were added
                features = ['key', 'mode', 'time', 'beatstr', 'offset',
                            'cadence_dists', 'cadence?', 'pitch', 'pbefore', 'pafter']
                for f_idx, feature in enumerate(features):
                    try:
                        assert in_range(example[f_idx], self.indices[feature][0], self.indices[feature][1])
                    except:
                        pass
                try:
                    assert in_range(numeral, 1, len(self.numerals))
                    assert in_range(inversion, 1, len(self.inversions))
                except:
                    pass

    # Write
    def write(self):
        print "Writing to %s..." % self.output_dir
        for idx, score in enumerate(self.X_train):
            with h5py.File(self.output_dir + "train_%d.hdf5" % idx, "w", libver='latest') as f:
                X_matrix = npy.matrix(score)
                f.create_dataset("X", X_matrix.shape, dtype='i', data=X_matrix)
                y_matrix = npy.matrix(self.y_train[idx])
                f.create_dataset("y", y_matrix.shape, dtype='i', data=y_matrix)
        for idx, score in enumerate(self.X_test):
            with h5py.File(self.output_dir + "test_%d.hdf5" % idx, "w", libver='latest') as f:
                X_matrix = npy.matrix(score)
                f.create_dataset("X", X_matrix.shape, dtype='i', data=X_matrix)
                y_matrix = npy.matrix(self.y_test[idx])
                f.create_dataset("y", y_matrix.shape, dtype='i', data=y_matrix)
        # Freeze features for evaluation later on
        freezeObject(self.harmonies, "harmonies")

    # Split a list into two sets, with a ratio of pg : 1 - pg (where 0 <= pg <= 1)
    def split(self, lst, pg):
        shuffle(lst)
        split_point = int(len(lst) * pg)
        set1 = lst[:split_point]
        set2 = lst[split_point:]
        # Make sure there is no overlap
        for s in self.training_split:
            for t in self.test_split:
                assert s != t
        return set1, set2

    def run(self):
        self.gather_scores()
        self.analyze()
        self.featurize()
        self.verify()
        self.write()

    def __str__(self):
        s = "\n---------- FEATURIZER RESULTS ----------\n"
        for name, values in self.features:
            s += name + ": " + str(values) + "\n"
        s += "\n"
        s += "Indices:\n"
        for name, values in self.features:
            s += "'%s': %s\n" % (name, str(self.indices[name]))
        s += "\n"
        s += "Roman numerals (%d total)\n%s\n" % (len(self.numerals), self.numerals)
        s += "\n"
        s += "Inversions (%d total)\n%s\n" % (len(self.inversions), self.inversions)
        s += "\n"
        s += "Test-training split: %d training chorales, %d test chorales\n" % (len(self.training_split), len(self.test_split))
        s += "Test-training examples: %d for training, %d for test\n" % (len(self.X_train), len(self.X_test))
        s += "---------------------------------------\n"
        return s

    __repr__ = __str__