def crawl_from_base(base_link, num_of_links):
    links = OrderedSet()
    link_index = 0
    while len(links) < num_of_links:
        # Get website data and set up parser
        html_data = requests.get(base_link)
        html_parser = BeautifulSoup(html_data.text, 'html.parser')
        # Get <a> tags
        a_tags = html_parser.find_all('a')
        # Collect href links that start with "http"
        for a_tag in a_tags:
            if len(links) >= num_of_links:
                return links
            elif a_tag.get('href', '') != '' and re.search("^http", a_tag['href']):
                links.add(a_tag['href'])
        # Crawl the next collected link (index first, then advance,
        # so that links[0] is not skipped)
        base_link = links[link_index]
        link_index += 1
    return links
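# A minimal illustration of why OrderedSet works as the crawl frontier above
# (a sketch, assuming the pip ordered-set package): duplicate hrefs collapse
# into one entry, insertion order is preserved, and the set supports
# positional indexing (links[link_index]) like a list.
from ordered_set import OrderedSet

links = OrderedSet()
for href in ['http://a.example', 'http://b.example', 'http://a.example']:
    links.add(href)
print(len(links))  # 2, the duplicate was ignored
print(links[1])    # http://b.example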
def loadCache():
    tracksSet = OrderedSet()
    with open("library_cache.json", "r") as f:
        tracks = json.load(f)
        for trackDict in tracks["tracks"]:
            tracksSet.add(Track.fromDict(trackDict))
    return tracksSet
def find_similar(topic, title, tags):
    print('DEBUG IN FIND SIMILAR: {}, {}, {}\n'.format(topic, title, tags))
    if topic not in valid_sites:
        raise Exception('Unsupported topic')
    method = 'search/advanced'
    SITE = StackAPI(topic, key=APP_KEY, access_token=ACCESS_TOKEN)
    similar = []
    # title match and 1+ tags match
    similar += SITE.fetch(method, q=title, tags=';'.join(tags), answers=1,
                          sort='votes')['items']
    # title match
    similar += SITE.fetch(method, q=title, answers=1, sort='votes')['items']
    # similar += SITE.fetch(method, tags=';'.join(tags), answers=1, sort='votes')['items']  # 1+ tags match
    ids = OrderedSet()
    for s in similar:
        ids.add(str(s['question_id']))
    ids = list(ids)[:15]  # Top 15
    print('{} SIMILAR FOUND\n'.format(len(ids)))
    return get_questions_and_answers(topic, ids)
class Circle(Shape):
    radius = 0

    def __init__(self, midpoint, radius):
        super().__init__()
        self.midpoint = midpoint
        self.radius = int(radius)

    @classmethod
    def from_rect(cls, rect):
        diameter = min(rect.width, rect.height)
        radius = int(diameter / 2)
        midpoint = rect.midpoint
        return cls(midpoint, radius)

    def find_points(self):
        midx, midy = self.midpoint
        self._points = OrderedSet()
        for x in range(-1 * self.radius, self.radius + 1):
            for y in range(-1 * self.radius, self.radius + 1):
                if self.contains_point((int(x), int(y))):
                    self._points.add((int(x + midx), int(y + midy)))

    def contains_point(self, p):
        x, y = p
        return (x + 0.5) ** 2 + (y + 0.5) ** 2 <= self.radius ** 2
def extract_phrases_without_new_chars():
    chars = get_chars('../output/TRADITIONAL_CHARS.TXT')
    phrases = extract_phrases('../ex-src/cj5-ftzk_utf-8.txt')
    phrases_without_new_chars = OrderedSet()
    phrases_with_new_chars = OrderedSet()
    for phrase in phrases:
        has_new = False
        for char in phrase:
            if char not in chars:
                has_new = True
                break
        if has_new:
            phrases_with_new_chars.add(phrase)
        else:
            phrases_without_new_chars.add(phrase)
    print('total phrases = {}'.format(len(phrases)))
    print('phrases without new chars = {}'.format(len(phrases_without_new_chars)))
    print('phrases with new chars = {}'.format(len(phrases_with_new_chars)))
    with open('../output/TRADITIONAL_PHRASES.TXT', 'w') as f:
        for p in phrases_without_new_chars:
            f.write('%s\n' % p)
    print('Phrases with new chars:')
    for p in phrases_with_new_chars:
        print(p)
def collect_all_phrases(path):
    chars_no_data = OrderedSet()
    char_phrases = []
    with open(path) as f:
        lines = f.readlines()
    for line in lines:
        char = line.strip()
        # no data file
        if not os.path.exists('../output/char_data/' + char + '.html'):
            chars_no_data.add(char)
        else:
            with open('../output/char_data/' + char + '.html') as df:
                content = df.read()
            if 'html' not in content:
                chars_no_data.add(char)
            else:
                phrases = collect_phrases(content)
                char_phrases.append(
                    json.dumps({'char': char, 'phrases': phrases}, ensure_ascii=False))
    # write chars with phrases
    print('total chars with phrases: {}'.format(len(char_phrases)))
    with open('../output/ONLINE_CHAR_PHRASES.TXT', 'a') as fo:
        for cp in char_phrases:
            fo.write("%s\n" % cp)
    # save remaining
    print('characters without data: {}'.format(len(chars_no_data)))
    for char in chars_no_data:
        print(char)
class IncludeRequest(Request):
    """ Adds the ability to include webassets bundles on the request.

    If the bundle does not exist, a KeyError will be raised during the
    rendering of the response, after the view has returned.

    Including a bundle multiple times will have the same result as
    including it once.

    The bundles are rendered in the order in which they were included.
    Bundles that are included first are also rendered first.

    For example:

        @App.html(model=Model)
        def view(self, request):
            request.include('jquery')  # includes the jquery bundle

    """

    def __init__(self, *args, **kwargs):
        super(IncludeRequest, self).__init__(*args, **kwargs)
        self.included_assets = OrderedSet()

    def include(self, resource):
        self.included_assets.add(resource)
def remove_stopwords(text):
    clean_text = OrderedSet()
    for i in text:
        i = re.sub(r'\W', '', i)
        if i not in stop_words:
            clean_text.add(i)
    return clean_text
def evidence_writer(filtered_evidence, sentence_id, data_source, resource_v,
                    top_k, predicate, set_up, rule_predicates):
    data_source = data_source + '/' + set_up
    # rule_predicates = get_rule_predicates(data_source, top_k, predicate)
    # print rule_predicates
    item_set = OrderedSet()
    print resource_v, predicate
    for evidence in filtered_evidence:
        if evidence[1] in rule_predicates:
            if evidence[0] == resource_v[0] and evidence[2] == resource_v[1] \
                    and evidence[1] == predicate:
                pass
            else:
                try:
                    item_set.add(evidence[1] + '("' + evidence[0] + '","' + evidence[2] + '").')
                except:
                    pass
    evidence_path = ('LPmln/' + data_source + '/evidence_' + top_k + '/'
                     + sentence_id + predicate)
    with open(evidence_path + '.txt', 'wb') as csvfile:
        for i in item_set:
            if '*' not in i:
                try:
                    csvfile.write(i + '\n')
                except:
                    pass
    with open(evidence_path + '.txt', 'r') as f, \
            open(evidence_path + '_unique.txt', 'wb') as out_file:
        out_file.writelines(unique_everseen(f))
    os.remove(evidence_path + '.txt')
    return item_set
def build_clr_states(self):
    self.canonical_collection = []
    start = OrderedSet(
        [Item(self.grammar.rules[0], 0, set([DomainTag.END_OF_TEXT]))])
    self.canonical_collection.append(State(self.grammar, start))
    i = 0
    while i < len(self.canonical_collection):
        swd = OrderedSet()
        for item in self.canonical_collection[i].items:
            if item.get_current() is not None:
                swd.add(item.get_current())
        for s in swd:
            next_state_items = OrderedSet()
            for item in self.canonical_collection[i].items:
                if item.get_current() is not None and item.get_current() == s:
                    temp = Item(item.rule, item.marker + 1, item.lookahead)
                    next_state_items.add(temp)
            next_state = State(self.grammar, next_state_items)
            exists = False
            for j in range(len(self.canonical_collection)):
                if self.canonical_collection[j].items == next_state.items:
                    exists = True
                    self.canonical_collection[i].transition[s] = self.canonical_collection[j]
            if not exists:
                self.canonical_collection.append(next_state)
                self.canonical_collection[i].transition[s] = next_state
        i += 1
def evidence_writer(evidences, sentence_id, data_source, resource_v, rule_predicates):
    item_set = OrderedSet()
    for evidence in evidences:
        if evidence[1] in rule_predicates:
            if evidence[0] == resource_v[0] and evidence[2] == resource_v[1] \
                    and evidence[1] == data_source:
                pass
            else:
                try:
                    if ('"' not in evidence[0] and '"' not in evidence[2]
                            and ':' not in evidence[0] and ':' not in evidence[2]
                            and '#' not in evidence[0] and '#' not in evidence[2]):
                        item_set.add(evidence[1] + '("' + evidence[0] + '","' + evidence[2] + '").')
                except:
                    pass
    base_path = ('dataset/' + data_source + '/evidence/' + dbpedia + '/'
                 + rule_mining + '/' + str(sentence_id))
    with open(base_path + '_.txt', 'wb') as csvfile:
        for i in item_set:
            if '*' not in i:
                try:
                    print i
                    csvfile.write(i + '\n')
                except:
                    pass
    with open(base_path + '_.txt', 'r') as f, \
            open(base_path + '_unique.txt', 'wb') as out_file:
        out_file.writelines(unique_everseen(f))
    os.remove(base_path + '_.txt')
    return item_set
class ShellCommandSerializer(Resetable, object):

    def __init__(self, name):
        """
        Parameters
        ----------
        name : str

        Attributes
        ----------
        shell_command_args : OrderedSet<(str, str, str)>
        __uniq_shell_commands : set<str>
        """
        self.name = name
        self.reset()

    def reset(self):
        self.shell_command_args = OrderedSet()
        self.__uniq_shell_commands = set()

    def get_shell_commands(self):
        return [x[1] for x in self.shell_command_args]

    def add_command(self, id, cmd, prefixes):
        prefixes = tuple(prefixes)
        if cmd not in self.__uniq_shell_commands:
            shell_command_arg = (id, cmd, prefixes)
            self.__uniq_shell_commands.add(cmd)
            self.shell_command_args.add(shell_command_arg)

    def run_commands(self, max_workers=None):
        if max_workers is None:
            max_workers = cpu_count()
        with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(lambda x: singletons.self.shell_helper.run_shell(*x),
                         self.shell_command_args)
def wrapper(*args):
    arg = args[0]
    nonlocal deferring  # stateful!
    if arg == SAFEWORD:
        deferring = not deferring
        if deferring:
            # nothing else to do this turn
            return
        # we're not deferring and we have stored calls to process
        if not call_queue.empty():
            # input order cannot be guaranteed as we're using multiprocessing
            # single-process input order can be guaranteed
            calls = OrderedSet()
            while not call_queue.empty():
                calls.add(call_queue.get())
            LOG.info("%s notifications to be sent call", len(calls))
            return [fn(*fnargs) for fnargs in calls]
        else:
            # we're not deferring and we have no calls to process
            # TODO: empty list or None ?
            return
    # store the args if we're deferring and return
    if deferring:
        call_queue.put(args)
        return
    # we're not deferring, call wrapped fn as normal
    return fn(*args)
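# A self-contained sketch of how the wrapper above might be wired up,
# assuming a closure-based decorator, a queue.Queue for call storage and a
# SAFEWORD sentinel. The decorator name and sentinel value here are
# hypothetical stand-ins, not the original module's names.
import queue
from ordered_set import OrderedSet

SAFEWORD = 'safeword'

def deferrable(fn):
    deferring = False
    call_queue = queue.Queue()

    def wrapper(*args):
        nonlocal deferring
        if args[0] == SAFEWORD:
            deferring = not deferring
            if deferring:
                return
            if not call_queue.empty():
                # OrderedSet drops duplicate calls but keeps first-seen order
                calls = OrderedSet()
                while not call_queue.empty():
                    calls.add(call_queue.get())
                return [fn(*fnargs) for fnargs in calls]
            return
        if deferring:
            call_queue.put(args)
            return
        return fn(*args)

    return wrapper

@deferrable
def notify(msg):
    print('sent:', msg)

notify(SAFEWORD)  # start deferring
notify('a'); notify('b'); notify('a')
notify(SAFEWORD)  # stop deferring: flushes 'a' and 'b' once each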
class Node:
    def __init__(self, x: int, y: int, width: int):
        self.x = x
        self.y = y
        self.width = width

        self.__neighbors = OrderedSet()
        self.__conn_ins = []
        self.__edge_cost = {}

    def add_edge(self, node: "Node", delay: int = 0, force_connect: bool = False):
        if not force_connect:
            assert self.width == node.width
        if node not in self.__neighbors:
            self.__neighbors.add(node)
            node.__conn_ins.append(self)
            self.__edge_cost[node] = delay

    def remove_edge(self, node: "Node"):
        if node in self.__neighbors:
            self.__edge_cost.pop(node)
            self.__neighbors.remove(node)
            # remove the incoming connections as well
            node.__conn_ins.remove(self)

    def get_edge_cost(self, node: "Node") -> int:
        if node not in self.__edge_cost:
            return MAX_DEFAULT_DELAY
        else:
            return self.__edge_cost[node]

    def get_conn_in(self) -> List["Node"]:
        return self.__conn_ins

    def __iter__(self) -> Iterator["Node"]:
        return iter(self.__neighbors)

    def __len__(self):
        return len(self.__neighbors)

    @abstractmethod
    def __repr__(self):
        pass

    @abstractmethod
    def node_str(self):
        pass

    def clear(self):
        self.__neighbors.clear()
        self.__edge_cost.clear()
        self.__conn_ins.clear()

    def __contains__(self, item):
        return item in self.__neighbors

    def __hash__(self):
        return hash(self.width) ^ hash(self.x) ^ hash(self.y)
def get_ignore_types_in_groups(self, ignore_type_in_groups,
                               ignore_string_type_changes,
                               ignore_numeric_type_changes,
                               ignore_type_subclasses):
    if ignore_type_in_groups:
        if isinstance(ignore_type_in_groups[0], type):
            ignore_type_in_groups = [ignore_type_in_groups]
    else:
        ignore_type_in_groups = []

    result = []
    for item_group in ignore_type_in_groups:
        new_item_group = OrderedSet()
        for item in item_group:
            item = type(item) if item is None or not isinstance(item, type) else item
            new_item_group.add(item)
        result.append(new_item_group)
    ignore_type_in_groups = result

    if ignore_string_type_changes and self.strings not in ignore_type_in_groups:
        ignore_type_in_groups.append(OrderedSet(self.strings))

    if ignore_numeric_type_changes and self.numbers not in ignore_type_in_groups:
        ignore_type_in_groups.append(OrderedSet(self.numbers))

    if ignore_type_subclasses:
        ignore_type_in_groups = list(map(tuple, ignore_type_in_groups))

    return ignore_type_in_groups
def deleteTracks(tracks):
    deletedTracks = OrderedSet()
    if not ("--delete" in sys.argv or "-d" in sys.argv) or len(tracks) == 0:
        return deletedTracks
    print("Will delete " + str(len(tracks)) + " songs from Youtube Music")
    confirmAll = False
    try:
        if sys.stdout.isatty():
            confirmAll = confirm("Confirm all (Y) or one by one (N)?")
        else:
            confirmAll = True
        for track in tracks:
            print("Delete " + str(track.artist) + " - " + str(track.title)
                  + " [" + str(track.album) + "]", end="")
            if confirmAll or confirm("?"):
                if confirmAll:
                    print()
                if track.entityId:
                    ytmusic.delete_upload_entity(track.entityId)
                    deletedTracks.add(track)
                else:
                    print("No entity id for this. You may want to rebuild cache (-rc)")
    except:
        pass
    return deletedTracks
class FileScrubber():
    excluded_set = {'[Chorus]', '[Chorus:]'}
    min_string_token_count = 2

    def __init__(self, file_name):
        self.file_name = file_name
        self.lines_set = OrderedSet()

    def scrub_file(self):
        with open(self.file_name, 'r') as file_handler:
            line_count = 0
            for line in file_handler:
                line_count += 1
                if (not line or line in self.excluded_set
                        or len(line.split()) < self.min_string_token_count):
                    continue
                line = line.strip(' ')
                if not line.endswith(',\n') and line_count % 3 == 0:
                    line = line.replace('\n', '.\n')
                else:
                    line = line.replace('\n', ' ')
                self.lines_set.add(line)
        with open('../scrubbed_file.txt', 'w') as file_handler:
            for item in self.lines_set:
                file_handler.write('{}'.format(item))
def resume(self, run, input):
    """
    Resumes an existing run with new input
    :param run: the previous run state
    :param input: the new input
    :return: the updated run state
    """
    if run.state == RunState.State.COMPLETED:
        raise FlowRunException("Cannot resume a completed run state")

    last_step = run.steps[-1] if len(run.steps) > 0 else None

    # reset steps list so that it doesn't grow forever in a never-ending flow
    run.steps = []

    if last_step:
        current_node = last_step.node  # we're resuming an existing run
    else:
        current_node = run.flow.entry  # we're starting a new run

    if not current_node:
        raise FlowRunException("Flow has no entry point")

    # tracks nodes visited so we can detect loops
    nodes_visited = OrderedSet()

    while current_node:
        # if we're resuming a previously paused step, then use its arrived on value
        if last_step and len(nodes_visited) == 0:
            arrived_on = last_step.arrived_on
        else:
            arrived_on = datetime.datetime.now(tz=pytz.UTC)

        # create new step for this node
        step = Step(current_node, arrived_on)
        run.steps.append(step)

        # should we pause at this node?
        if isinstance(current_node, RuleSet):
            if current_node.is_pause() and (not input or input.consumed):
                run.state = RunState.State.WAIT_MESSAGE
                return run

        # check for a non-pausing loop
        if current_node in nodes_visited:
            raise FlowLoopException(nodes_visited)
        else:
            nodes_visited.add(current_node)

        next_node = current_node.visit(self, run, step, input)

        if next_node:
            # if we have a next node, then record leaving this one
            step.left_on = datetime.datetime.now(tz=pytz.UTC)
        else:
            # if not then we've completed this flow
            run.state = RunState.State.COMPLETED

        current_node = next_node

    return run
def _order_node_columns(cols: Set) -> OrderedSet:
    """
    Arrange node columns in a defined order.

    Parameters
    ----------
    cols: Set
        A set with elements in any order

    Returns
    -------
    OrderedSet
        A set with elements in a defined order

    """
    node_columns = cols.copy()
    core_columns = OrderedSet([
        "id", "category", "name", "description", "xref", "provided_by", "synonym"
    ])
    ordered_columns = OrderedSet()
    for c in core_columns:
        if c in node_columns:
            ordered_columns.add(c)
            node_columns.remove(c)
    internal_columns = set()
    remaining_columns = node_columns.copy()
    for c in node_columns:
        if c.startswith("_"):
            internal_columns.add(c)
            remaining_columns.remove(c)
    ordered_columns.update(sorted(remaining_columns))
    ordered_columns.update(sorted(internal_columns))
    return ordered_columns
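# Usage sketch (hypothetical column names), assuming _order_node_columns from
# above is in scope: core columns come first in their fixed order, then the
# remaining columns sorted, then internal "_"-prefixed columns sorted.
cols = {"name", "_version", "id", "custom_field", "category"}
print(list(_order_node_columns(cols)))
# ['id', 'category', 'name', 'custom_field', '_version']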
def children(self):
    children = OrderedSet()
    for ports in self.wires:
        for port in ports:
            if port.owner() == self:
                continue
            children.add(port.owner())
    return children
def get_phrases_from_ext_dict(path):
    phrases = OrderedSet()
    with open(path) as f:
        lines = f.readlines()
    for line in lines:
        phrases.add(line.strip())
    return phrases
def days(self):
    days = OrderedSet()
    for service in self.services.available.values():
        for day in service.days:
            days.add(day)
    return days
def hole_4d():
    hyperplanes = OrderedSet()
    a1 = np.array([1, 0, 0, 0])
    a2 = np.array([-1, 0, 0, 0])
    a3 = np.array([0, 1, 0, 0])
    a4 = np.array([0, -1, 0, 0])
    a5 = np.array([0, 0, 1, 0])
    a6 = np.array([0, 0, -1, 0])
    a7 = np.array([0, 0, 0, 1])
    a8 = np.array([0, 0, 0, -1])

    p0 = Hyperplane(a5, -1)
    p1 = Hyperplane(a6, 0)
    p2 = Hyperplane(a7, -1)
    p3 = Hyperplane(a8, 0)
    p4 = Hyperplane(a1, 0)
    p5 = Hyperplane(a2, -1)
    p6 = Hyperplane(a3, -1)
    p7 = Hyperplane(a4, 0)
    P1 = set()
    for h in [p0, p1, p2, p3, p4, p5, p6, p7]:
        index = hyperplanes.add(h)
        P1.add((index, -1))

    q1 = Hyperplane(a1, -1)
    q2 = Hyperplane(a2, 0)
    q3 = Hyperplane(a3, 0)
    q4 = Hyperplane(a4, -1)
    P2 = set()
    for h in [p0, p1, p2, p3, q1, q2, q3, q4]:
        index = hyperplanes.add(h)
        P2.add((index, -1))

    r1 = Hyperplane(a1, -2)
    r2 = Hyperplane(a2, 1)
    r3 = Hyperplane(a3, -1)
    r4 = Hyperplane(a4, 0)
    P3 = set()
    for h in [p0, p1, p2, p3, r1, r2, r3, r4]:
        index = hyperplanes.add(h)
        P3.add((index, -1))

    s1 = Hyperplane(a1, -1)
    s2 = Hyperplane(a2, 0)
    s3 = Hyperplane(a3, -2)
    s4 = Hyperplane(a4, 1)
    P4 = set()
    for h in [p0, p1, p2, p3, s1, s2, s3, s4]:
        index = hyperplanes.add(h)
        P4.add((index, -1))

    return Cell_Decomposition(hyperplanes, [P1, P2, P3, P4])
class Index:
    def __init__(self, simple_url=constants.PYPI_SIMPLE_URL,
                 package_url=constants.PYPI_PACKAGE_URL):
        self.package_url = package_url
        self.simple_url = simple_url
        self._package_names = None

    @property
    def package_names(self):
        if self._package_names is None:
            self._package_names = OrderedSet()
            self.reload()
        return self._package_names

    def _get_html_data(self):
        if self.simple_url.startswith('/'):
            with open(self.simple_url) as fp:
                data = fp.read()
        else:
            response = requests.get(self.simple_url)
            data = response.content
        return data

    def _get_names(self):
        data = self._get_html_data()
        soup = BeautifulSoup(data, 'html.parser')
        links = soup.find_all('a')
        names = (link.string for link in links)
        return names

    def _add_package_names(self, names):
        if self._package_names is None:
            self._package_names = OrderedSet()
        for name in names:
            self._package_names.add(name)

    def reload(self):
        """
        Reload package names from index.
        """
        names = self._get_names()
        self._add_package_names(names)

    def __len__(self):
        if self._package_names is None:
            return 0
        return len(self.package_names)

    def __iter__(self):
        return (Package(name, self) for name in self.package_names)

    def __repr__(self):
        return "<Index '{}'>".format(self.simple_url)
class ClassificationResult(ABC):
    """
    The base class for classification problem result.
    """

    def __init__(self):
        self.resultdict = dict()
        self.metric_set = OrderedSet()
        self.label_set = OrderedSet()
        self.confusion_matrices = dict()

    def update_result(self, metric: str, label: str, value: float):
        """
        update the result based on metric name and class label (for each class)

        Args:
            metric (str): metric name, e.g. `accuracy`, `recall`
            label (str): class label name
            value (float): metric value
        """
        if metric not in self.resultdict.keys():
            self.resultdict[metric] = dict()
            self.metric_set.add(metric)
        self.resultdict[metric][label] = value
        self.label_set.add(label)

    @abstractmethod
    def load_results_from_meta(self, evaluation_result: dict, labels: List[str] = None):
        raise NotImplementedError('The derived class should implement it.')

    @abstractmethod
    def convert_metrics_to_table(self) -> List[Tuple[str, List[str], List[List[float]]]]:
        """
        converts the metrics saved in the object to a table that is ready to
        render in the report.

        Returns:
            a set of tables (title, header, values)
        """
        raise NotImplementedError('The derived class should implement it.')

    def get_metric_list(self):
        """
        returns all the metric names

        Returns:
            a list of metric names
        """
        return list(self.metric_set)

    def get_label_list(self):
        """
        returns all the class names

        Returns:
            a list of class label names
        """
        return list(self.label_set)
def ask_for_nodes(self):
    nodes_list = requests.get("http://{}/node".format(RAS_IP))
    nodes_list = json.loads(nodes_list.content)
    self.boot_node_list = nodes_list
    m_set = OrderedSet()
    for node in nodes_list:
        m_set.add(node.get('publicKey'))
    m_set = sorted(m_set)
    self.mining_nodes_list = m_set
class Shape(object):
    def __init__(self):
        self._points = OrderedSet()
        self._outline = OrderedSet()
        self._border = OrderedSet()
        self.dirty = True
        self.midpoint = (0, 0)

    def refresh(self):
        self.find_points()
        self.find_outline()
        self.find_border()
        self.dirty = False

    @property
    def outline(self):
        """The points outside the shape that are adjacent to it"""
        if self.dirty:
            self.refresh()
        return self._outline

    @property
    def border(self):
        """The points inside the shape along the border"""
        if self.dirty:
            self.refresh()
        return self._border

    @property
    def points(self):
        if self.dirty:
            self.refresh()
        return self._points

    def find_points(self):
        raise NotImplementedError()

    def find_outline(self):
        self._outline = OrderedSet()
        for point in self._points:
            for neighbor in neighbors(point):
                if neighbor not in self._points:
                    self._outline.add(neighbor)

    def find_border(self):
        self._border = OrderedSet()
        for point in self._points:
            for neighbor in neighbors(point):
                if neighbor not in self._points:
                    self._border.add(point)
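# A minimal sketch of the outline/border distinction above, assuming the
# Shape class is in scope and supplying a hypothetical 4-neighborhood
# neighbors() helper (the real helper lives elsewhere in this codebase).
def neighbors(point):
    x, y = point
    return [(x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)]

class Dot(Shape):
    def find_points(self):
        self._points = OrderedSet([self.midpoint])

d = Dot()
print(list(d.points))   # [(0, 0)]
print(list(d.outline))  # the four neighbors, in insertion order
print(list(d.border))   # [(0, 0)]: the dot is its own border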
class Routes:
    def __init__(self):
        self.routes = {}
        self.route_number = 0
        self.route_list = OrderedSet()
        self.route = ''

    @property
    def _get_route_list(self):
        return self.route_list

    @property
    def _get_routes(self):
        return self.routes

    def _add(self, point):
        self.route_list.add(point)
        self.route = '-'.join(self.route_list)

    def _remove(self, point):
        self.route_list.remove(point)

    def _set_route(self):
        def _route_cost(_set, cost=0):
            _set = list(_set)
            for i in range(len(_set) - 1):
                if any(_set[i + 1] == k for k in nodes[_set[i]].keys()):
                    cost += nodes[_set[i]][_set[i + 1]]
            return cost

        if len(self.route) > 1:
            result = self.route
            if result not in self.routes.values():
                self.route_number += 1
                cost = _route_cost(self.route_list)
                self.routes[self.route_number] = result, cost
            self.route_list = self.route_list[:-1]
            self.route = ''

    @staticmethod
    def cheapest_route(routes):
        if routes:
            # track the cheapest (route, cost) pair seen so far
            best = next(iter(routes.values()))
            for route, cost in routes.values():
                if cost < best[1]:
                    best = (route, cost)
            return f'cheapest route is: {best}'
        return 'There are no routes...'

    def __str__(self):
        return str(self.routes)
def extract_new_chars_from_phrases(phrases):
    old_chars = get_chars('../output/TRADITIONAL_CHARS.TXT')
    new_chars = OrderedSet()
    for phrase in phrases:
        for char in phrase:
            if char not in old_chars:
                new_chars.add(char)
    # write new chars
    for c in new_chars:
        print(c)
def get_phrases_from_hvdict(path):
    phrases = OrderedSet()
    with open(path) as f:
        lines = f.readlines()
    for line in lines:
        d = json.loads(line.strip())
        for phrase in d['phrases']:
            phrases.add(phrase)
    return phrases
def get_chars(path):
    chars = OrderedSet()
    with open(path) as f:
        lines = f.readlines()
    for line in lines:
        for char in line.strip():
            chars.add(char)
    return chars
def get_data(path):
    res = OrderedSet()
    with open(path) as f:
        for line in f:
            if ' ' in line:
                res.add(line.split()[1].strip().replace(',', '').replace('.', ''))
            else:
                res.add(line.strip().replace(',', '').replace('.', ''))
    return res
def check_time_units(self):
    ret = OrderedSet()
    for name in self.wb.sheetnames:
        if self.is_rnum_sheet(name):
            ret.add(self.get_time_unit(self.wb[name]))
    if len(ret) > 1:
        print("ERROR: Multiple time units are present in " + self.file + " " + str(ret))
        return False
    print("Time Unit : " + ''.join(ret))
    return True
class Crawler():
    def __init__(self, url, depth=25):
        self.crawled_urls = OrderedSet([])
        if is_url_valid(url):
            url = get_clean_url(url, '')
            self.depth = depth
            self.index = 0
            self.crawled_urls.add(url)
            self.crawl(url)

    def crawl(self, url):
        '''
        Crawl over URLs
            - scrape for anchor tags with hrefs in a webpage
            - reject if unwanted or cleanup the obtained links
            - append to a set to remove duplicates
            - "crawled_urls" is the repository for crawled URLs
        @input:
            url: URL to be scraped
        '''
        found_urls = []
        try:
            page = urlopen(url)
            content = page.read()
            soup = BeautifulSoup(content, 'lxml', parse_only=SoupStrainer('a'))
            for anchor in soup.find_all('a'):
                link = anchor.get('href')
                if is_url_valid(link):
                    # Complete relative URLs
                    link = get_clean_url(url, link)
                    if is_link_internal(link, url):
                        found_urls.append(link)
        except HTTPError as e:
            print('HTTPError:' + str(e.code) + ' in ', url)
        except URLError as e:
            print('URLError: ' + str(e.reason) + ' in ', url)
        except Exception:
            import traceback
            print('Generic exception: ' + traceback.format_exc() + ' in ', url)

        cleaned_found_urls = set(found_urls)  # To remove repetitions
        self.crawled_urls |= cleaned_found_urls  # Union of sets

        if len(self.crawled_urls) > self.depth:
            self.crawled_urls = self.crawled_urls[:self.depth]
            return
        else:
            self.index += 1
            if self.index < len(self.crawled_urls):
                self.crawl(self.crawled_urls[self.index])
            else:
                return
def solve(ring, values):
    for value in values:
        soln = OrderedSet([value])
        target = value + ring[0] + ring[1]
        for ridx in range(1, len(ring)):
            # wrap around to ring[0] on the last segment
            diff = target - (ring[ridx] + ring[(ridx + 1) % len(ring)])
            # TODO: We could short-circuit here if diff is not in
            # values, but I like this flow better
            if diff in values:
                soln.add(diff)
        if len(soln) == len(values):
            return ring, soln
    return None
def sub_questions(question, flows):
    questions = OrderedSet()
    try:
        qflows = flows.filter(question=question).exclude(next_question=question)
        if qflows:
            for flow in qflows:
                if flow.next_question:
                    questions.add(flow.next_question)
                    subsequent = sub_questions(flow.next_question, flows)
                    for q in subsequent:
                        questions.add(q)
    except QuestionFlow.DoesNotExist:
        return OrderedSet()
    return questions
def add(self, pkgList):
    """Given a list of lines from the input file, strip off any leading
       symbols and add the result to the appropriate list.
    """
    existingExcludedSet = OrderedSet(self.excludedList)
    existingPackageSet = OrderedSet(self.packageList)
    newExcludedSet = OrderedSet()
    newPackageSet = OrderedSet()

    excludedGroupList = []

    for pkg in pkgList:
        stripped = pkg.strip()

        if stripped[0:2] == "@^":
            self.environment = stripped[2:]
        elif stripped[0] == "@":
            self._processGroup(stripped[1:])
        elif stripped[0] == "-":
            if stripped[1:3] == "@^" and self.environment == stripped[3:]:
                self.environment = None
            elif stripped[1] == "@":
                excludedGroupList.append(Group(name=stripped[2:]))
            else:
                newExcludedSet.add(stripped[1:])
        else:
            newPackageSet.add(stripped)

    # Groups have to be excluded in two different ways (note: can't use
    # sets here because we have to store objects):
    excludedGroupNames = [g.name for g in excludedGroupList]

    # First, an excluded group may be cancelling out a previously given
    # one. This is often the case when using %include. So there we should
    # just remove the group from the list.
    self.groupList = [g for g in self.groupList if g.name not in excludedGroupNames]

    # Second, the package list could have included globs which are not
    # processed by pykickstart. In that case we need to preserve a list of
    # excluded groups so whatever tool doing package/group installation can
    # take appropriate action.
    self.excludedGroupList.extend(excludedGroupList)

    existingPackageSet = (existingPackageSet - newExcludedSet) | newPackageSet
    existingExcludedSet = (existingExcludedSet - existingPackageSet) | newExcludedSet

    # FIXME: figure these types out
    self.packageList = list(existingPackageSet)
    self.excludedList = list(existingExcludedSet)
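# A minimal illustration of the OrderedSet algebra used above (a sketch,
# assuming the pip ordered-set package): difference and union return new
# OrderedSets that keep the left operand's order, so the package list stays
# stable across repeated add() calls. Package names here are hypothetical.
from ordered_set import OrderedSet

existing = OrderedSet(['vim', 'emacs', 'nano'])
excluded = OrderedSet(['emacs'])
new = OrderedSet(['ed'])
print(list((existing - excluded) | new))  # ['vim', 'nano', 'ed']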
def standardize_vecs(labels, vecs, merge_mode='weighted'):
    standardized_labels = OrderedSet()
    standardized_vecs = []
    for index, (label, vec) in enumerate(zip(labels, vecs)):
        label = standardize(label)

        if merge_mode == 'weighted':
            vec /= (index + 1)

        if label not in standardized_labels:
            standardized_labels.add(label)
            standardized_vecs.append(vec)
        else:
            if merge_mode != 'first':
                index = standardized_labels.index(label)
                standardized_vecs[index] += vec
    return list(standardized_labels), np.array(standardized_vecs)
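# A small demonstration of the merge trick above: OrderedSet.index() finds
# the first-seen position of a repeated label so its vector can be
# accumulated in place rather than appended again. The labels and vectors
# here are hypothetical.
import numpy as np
from ordered_set import OrderedSet

labels = OrderedSet()
vecs = []
for label, vec in [('a', np.array([1.0, 0.0])),
                   ('b', np.array([0.0, 1.0])),
                   ('a', np.array([1.0, 1.0]))]:
    if label not in labels:
        labels.add(label)
        vecs.append(vec)
    else:
        vecs[labels.index(label)] += vec
print(list(labels), [v.tolist() for v in vecs])
# ['a', 'b'] [[2.0, 1.0], [0.0, 1.0]]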
def write_wide_format_otu_table(**kwargs):
    output_table_io = kwargs.pop('output_table_io')
    table_collection = kwargs.pop('table_collection')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    if hasattr(output_table_io, 'name'):
        logging.info("Writing %s" % output_table_io.name)
    else:
        logging.info("Writing an OTU table")

    # Collect a hash of sequence to sample to num_seqs
    gene_to_seq_to_sample_to_count = OrderedDict()
    sequence_to_taxonomy = {}
    samples = OrderedSet()
    for otu in table_collection:
        if otu.marker not in gene_to_seq_to_sample_to_count:
            gene_to_seq_to_sample_to_count[otu.marker] = {}
        if otu.sequence not in gene_to_seq_to_sample_to_count[otu.marker]:
            gene_to_seq_to_sample_to_count[otu.marker][otu.sequence] = {}
        if otu.sample_name in gene_to_seq_to_sample_to_count[otu.marker][otu.sequence]:
            raise Exception("Unexpectedly found 2 of the same sequences for the same sample and marker")
        gene_to_seq_to_sample_to_count[otu.marker][otu.sequence][otu.sample_name] = otu.count
        samples.add(otu.sample_name)
        # This isn't perfect, because the same sequence might have
        # different taxonomies in different samples. But taxonomy might
        # be of regular form, or as a diamond example etc, so eh.
        sequence_to_taxonomy[otu.sequence] = otu.taxonomy

    # header
    output_table_io.write("\t".join(itertools.chain(
        ['marker', 'sequence'],
        samples,
        ['taxonomy\n'])))
    for gene, seq_to_sample_to_count in gene_to_seq_to_sample_to_count.items():
        for seq, sample_to_count in seq_to_sample_to_count.items():
            row = [gene, seq]
            for sample in samples:
                try:
                    row.append(str(sample_to_count[sample]))
                except KeyError:
                    row.append('0')
            row.append(sequence_to_taxonomy[seq])
            output_table_io.write("\t".join(row) + "\n")
def all_questions(self):
    """This might be different from the flow questions because it might
    have group parameter questions if present
    :return:
    """
    if self.parameter_list:
        questions = OrderedSet(self.parameter_list.parameters)
    else:
        questions = OrderedSet()
    for q in self.flow_questions:
        questions.add(q)
    return questions
class SparseEntryStorage(object):
    """
    Temporarily stores entries of a labeled sparse matrix in an efficient
    format.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """
        Resets this SparseEntryStorage to being empty.
        """
        self.labels = OrderedSet()
        self.entries = defaultdict(float)

    def add_entry(self, entry):
        """
        Add a single triple of the form (value, row_label, col_label).
        """
        value, row_label, col_label = entry
        key = (self.labels.add(row_label), self.labels.add(col_label))
        self.entries[key] += value

    def add_entries(self, entries):
        """
        Add triples of the form (value, row_label, col_label).
        """
        for value, row_label, col_label in entries:
            key = (self.labels.add(row_label), self.labels.add(col_label))
            self.entries[key] += value

    def labels_and_matrix(self):
        """
        Return the labels and symmetrized sparse matrix.
        """
        # Borrowed from scipy.sparse.dok_matrix.tocoo()
        data = np.asarray(list(self.entries.values()), dtype='d')
        indices = np.asarray(list(self.entries.keys()), dtype=np.intc).T
        labels = self.labels
        matrix = coo_matrix((data, indices), shape=(len(labels), len(labels)))
        return labels, matrix + matrix.T
def select_averaged_rows(self, row_dict):
    """
    Given a mapping from labels to row-indices, returns a space in which
    the row with a given label is the average of those row-indices.
    """
    labels = OrderedSet()
    new_u = np.zeros((len(row_dict), self.k))
    for label, indices in row_dict.items():
        rownum = labels.add(label)
        old_rows = self.u[indices, :]
        new_u[rownum] = sum(old_rows) / len(old_rows)
    return self.__class__(new_u, self.sigma, labels)
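# The pattern above (and in SparseEntryStorage) relies on OrderedSet.add()
# returning the item's index: a new label gets the next free row number,
# and a repeated label returns the row it already occupies.
from ordered_set import OrderedSet

labels = OrderedSet()
print(labels.add('cat'))  # 0
print(labels.add('dog'))  # 1
print(labels.add('cat'))  # 0: already present, same row number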
def survey_questions(self):
    inline_ques = self.questions_inline()
    questions = OrderedSet(inline_ques)
    survey_questions = OrderedSet()
    other_flows = QuestionFlow.objects.exclude(
        validation_test__isnull=True,
        question__pk__in=[q.pk for q in inline_ques]
    ).exclude(
        next_question__pk__in=[q.pk for q in inline_ques]  # skip questions
    )
    for ques in inline_ques:
        survey_questions.append(ques)
        for q in sub_questions(ques, other_flows):
            survey_questions.add(q)
    return survey_questions
def ask(self, query_symbols, logical_query, coeff_expr=None):
    """
    Builds a pyDataLog program from the logical_query and loads it.
    Then executes the query for the query_symbols.

    :param query_symbols: The symbols to be queried.
    :type query_symbols: list(SubSymbol)
    :param logical_query: The logical query containing constants and presumably the query symbols
    :param coeff_expr: The coefficient expression for the given query
    :return: The transformed answers for the query symbols
    """
    helper_len = 0
    tmp = None
    if not query_symbols:
        return None
    if coeff_expr is None:
        helper_len = len(query_symbols)
        helper_predicate = 'helper(' + ','.join([str(v) for v in query_symbols]) + ')'
        tmp = helper_predicate + " <= " + self.transform_query(logical_query)
    else:
        helper_len = len(query_symbols) + 1
        syms = OrderedSet(query_symbols)
        syms.add('COEFF_EXPR')
        helper_predicate = 'helper(' + ','.join([str(v) for v in syms]) + ')'
        index_query = self.transform_query(logical_query)
        coeff_query = "(COEFF_EXPR == " + str(coeff_expr) + ")"
        if index_query is None:
            tmp = helper_predicate + " <= " + coeff_query
        else:
            tmp = helper_predicate + " <= " + " & ".join([index_query, coeff_query])
    log.debug("pyDatalog query: " + tmp)
    pyDatalog.load(tmp)
    answer = pyDatalog.ask(helper_predicate)
    pyEngine.Pred.reset_clauses(pyEngine.Pred("helper", helper_len))
    if answer is None:
        return []
    return self.transform_answer(answer.answers)
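# A minimal illustration of the OrderedSet trick above: converting the query
# symbols to an OrderedSet deduplicates them while keeping their order, and
# add() appends 'COEFF_EXPR' at the end only if it is not already present.
# The symbol names are hypothetical.
from ordered_set import OrderedSet

syms = OrderedSet(['X', 'Y', 'X'])
syms.add('COEFF_EXPR')
print('helper(' + ','.join(str(v) for v in syms) + ')')  # helper(X,Y,COEFF_EXPR)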
def _flow_questions():
    # next line is to normalize to question set. Otherwise it seems to be
    # causing some issues with flows, since the flow is more native to Qset.
    # Additional attributes in subclasses are just extras
    qset = QuestionSet.get(id=self.id)
    inline_ques = qset.questions_inline()
    flow_questions = OrderedSet()
    for ques in inline_ques:
        flow_questions.append(ques)
        # boldly assuming subquests don't go more than one subquestion deep
        for q in ques.direct_sub_questions():
            flow_questions.add(q)
    return flow_questions
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return mat.tocsr(shape), index
class Ellipse(Shape):
    rx = 0
    ry = 0

    def __init__(self, midpoint, rx, ry):
        super(Ellipse, self).__init__()
        self.midpoint = midpoint
        self.rx = int(rx)
        self.ry = int(ry)

    @classmethod
    def from_rect(cls, rect):
        ry = int(rect.height / 2)
        rx = int(rect.width / 2)
        midpoint = rect.midpoint
        return cls(midpoint, rx, ry)

    def find_points(self):
        midx, midy = self.midpoint
        self._points = OrderedSet()
        for x in range(-1 * self.rx, self.rx + 1):
            for y in range(-1 * self.ry, self.ry + 1):
                if self.contains_point((int(x + midx), int(y + midy))):
                    self._points.add((int(x + midx), int(y + midy)))

    def contains_point(self, p):
        x, y = p
        midx, midy = self.midpoint
        vx = (float(x) - float(midx)) ** 2 / float(self.rx) ** 2
        vy = (float(y) - float(midy)) ** 2 / float(self.ry) ** 2
        return vx + vy <= 1.0
def ask(self, query_symbols, logical_query, coeff_expr=None):
    """
    Builds a prolog query for a given set of query symbols, a logical query
    and a coefficient expression

    :param query_symbols: A set of query (sub)symbols to be queried for
    :param logical_query: The logical query containing constants and presumably the query symbols
    :param coeff_expr: The coefficient expression for the given query
    :return: A list of tuples containing the answers for the query symbols
    """
    if coeff_expr is None:
        lhs_rule = 'helper(' + ','.join([str(v) for v in query_symbols]) + ')'
        rule = lhs_rule + ":-" + self.transform_query(logical_query) + "."
        query = "query(" + lhs_rule + ")."
    else:
        syms = OrderedSet(query_symbols)
        syms.add('COEFF_EXPR')
        lhs_rule = 'helper(' + ','.join([str(v) for v in syms]) + ')'
        index_query = self.transform_query(logical_query)
        coeff_query = "COEFF_EXPR = " + str(coeff_expr)
        query = "query(" + lhs_rule + ")."
        if index_query is None:
            rule = lhs_rule + " :- " + coeff_query + "."
        else:
            rule = lhs_rule + " :- " + " , ".join([index_query, coeff_query]) + "."
    answer = self.execute([rule, query])
    answer_args = []
    for key in answer.keys():
        answer_args.append(key.args)
    # Query yields no result
    if answer.values()[0] == 0.0:
        return []
    return self.transform_answer(answer_args)
class MasterTransducer(Transducer):
    """
    A collection of transducers. This class is intended to be used as a
    singleton.
    """
    def __init__(self):
        super().__init__([])
        self.transducers = OrderedDict()
        self.groups = OrderedDict()
        self.selected = OrderedSet()
        self.parser = None

    def add(self, transducer, groups=None):
        """
        Registers a `Transducer`. The transducers are guaranteed to execute
        in the order in which they are added.
        """
        assert isinstance(transducer, Transducer)
        name = transducer.name
        assert name is not None
        assert name not in self.transducers.keys(), 'Duplicate transducer "{0}"'.format(name)
        self.transducers[name] = transducer
        for group in groups or []:
            assert group in self.groups.values()
            group.add(transducer)

    def add_group(self, name, description=None):
        """
        Creates and registers a transducer group.

        :param name: the name of the group
        :param description: the description of the group
        :return: the constructed group instance
        """
        assert name not in self.groups.keys(), 'Duplicate group "{0}"'.format(name)
        group = TransducerGroup(name, description)
        self.groups[name] = group
        return group

    def add_arguments(self, parser):
        """
        Registers command line arguments that control this `MasterTransducer`
        in an :py:mod:`ArgumentParser`.

        :param parser: an :py:mod:`ArgumentParser`
        """
        assert isinstance(parser, ArgumentParser)
        self.parser = parser

        group_names = list(self.groups.keys())
        parser.add_argument(
            '--group', '-g', nargs='+', action='append', choices=group_names,
            metavar='G',
            help=_('Enables the transducer group G. Combine with --help to show '
                   'detailed information. Available groups: {0}').format(
                ', '.join(group_names)))

        transducer_names = list(self.transducers.keys())
        parser.add_argument(
            '--transducer', '-t', nargs='+', action='append', choices=transducer_names,
            metavar='T',
            help=_('Enables the transducer T. Combine with --help to show '
                   'detailed information. Available transducers: {0}').format(
                ', '.join(transducer_names)))

    def configure(self, args, file=sys.stdout):
        """
        Configures this `MasterTransducer` using the arguments parsed by an
        :py:mod:`ArgumentParser`.

        :param args: command line arguments parsed by an :py:mod:`ArgumentParser`
        :param file: the file to print help string to
        """
        self.selected = OrderedSet()
        if args.group:
            for group_name in chain.from_iterable(args.group):
                group = self.groups[group_name]
                if args.help:
                    self.parser.print_help(file)
                    file.write('\n')
                    group.print_help(file)
                    self.parser.exit()
                for transducer in group.transducers:
                    self.selected.add(transducer)
        if args.transducer:
            for transducer_name in chain.from_iterable(args.transducer):
                transducer = self.transducers[transducer_name]
                if args.help:
                    self.parser.print_help(file)
                    file.write('\n')
                    transducer.print_help(file)
                    self.parser.exit()
                self.selected.add(transducer)
        if len(self.selected) == 0:
            # If no transducer is selected explicitly, all transducers are used.
            self.selected = self.transducers.values()

    @overrides
    def substitute(self, string, indices):
        """
        Translates a string using the selected transducers.
        """
        for transducer in self.selected:
            string, indices = transducer.substitute(string, indices)
        return string, indices
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    assert set1.index(tup) == 0
    assert set1[0] == tup
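# A companion test in the same style (a sketch, not part of the original
# suite), checking that OrderedSet keeps insertion order and ignores
# duplicates.
def test_order_and_dedup():
    set1 = OrderedSet()
    for item in ['b', 'a', 'b', 'c']:
        set1.add(item)
    assert list(set1) == ['b', 'a', 'c']
    assert set1.index('a') == 1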
class TableShaper(object):

    #-------------------------
    # unfold
    #-----------------

    def unfold(self,
               in_path_or_2d_array,
               col_name_to_unfold,
               col_name_unfold_values,
               out_method=OutMethod.STDOUT,
               constant_cols=None,
               new_col_names_col=None):
        '''
        Unfold (reshape) data frame like the following example:

        ====== ======== ============ ========= ======
        userId question questionType timeAdded answer
        ====== ======== ============ ========= ======
        10     DOB      pullDown     Jun2010   1983
        10     gender   radio        May2011   F
        20     DOB      pullDown     Jun2010   1980
        20     gender   radio        May2011   M
        ====== ======== ============ ========= ======

        Let the unfold column be 'question', and the 'constants' columns be
        'questionType' and 'timeAdded'. You could call the function like this:

            unfold('/tmp/in.csv',
                   col_name_to_unfold='question',
                   col_name_unfold_values='answer',
                   constant_cols=['questionType','timeAdded'])

        The reshaped table looks like this:

        ======== ============ ========= ==== ====
        question questionType timeAdded v1   v2
        ======== ============ ========= ==== ====
        DOB      pullDown     June2010  1983 1980
        gender   radio        May2011   F    M
        ======== ============ ========= ==== ====

        Each line is now one question. All answers to one question are
        columns in that question's row.

        It is an error to have inconsistencies in the constants-columns.
        For instance, if the original row "20 DOB pullDown..." had been
        "20 DOB radio..." an error would have been raised. All constant-col
        field values for the same question (in different rows of the
        original) must match.

        Another way to call the function controls the names of the new
        columns. One column can be specified to provide the column headers:

            unfold('/tmp/in.csv',
                   col_name_to_unfold='question',
                   col_name_unfold_values='answer',
                   constant_cols=['questionType','timeAdded'],
                   new_col_names_col='userId')

        The reshaped table would look like this:

        ======== ============ ========= ==== ====
        question questionType timeAdded 10   20
        ======== ============ ========= ==== ====
        DOB      pullDown     June2010  1983 1980
        gender   radio        May2011   F    M
        ======== ============ ========= ==== ====

        I.e. the user id values are used as the column headers of the
        new table.

        To have the function behave like an iterator (each item will be an
        array with one row of the reshaped table):

            it = unfold('/tmp/in.csv',
                        col_name_to_unfold='question',
                        col_name_unfold_values='answer',
                        constant_cols=['questionType','timeAdded'],
                        out_method=OutMethod.ITERATOR)
            for row in it:
                print(row)

        To write the output to a file:

            unfold('/tmp/in.csv',
                   col_name_to_unfold='question',
                   col_name_unfold_values='answer',
                   constant_cols=['questionType','timeAdded'],
                   new_col_names_col='userId',
                   out_method=OutMethod('/tmp/trash.csv'))

        :param in_path_or_2d_array: location of input CSV file, or an array
            of arrays. First row must be column names.
        :type in_path_or_2d_array: {string | [[]]}
        :param col_name_to_unfold: name of the column to unfold into columns
        :type col_name_to_unfold: string
        :param col_name_unfold_values: column name of the unfold values, i.e.
            the values in rows under the new columns
        :type col_name_unfold_values: string
        :param out_method: where to put the output CSV. If omitted, new table
            is written to stdout.
        :type out_method: OutMethod
        :param constant_cols: names of columns that are to be retained
        :type constant_cols: {None | [string]}
        :param new_col_names_col: name of column to use for column names of
            new columns
        :type new_col_names_col: {None | string}
        '''
        # Error checking and initializations:
        if type(col_name_to_unfold) != str:
            raise ValueError('Must name column that is to be unfolded')
        else:
            self.col_name_to_unfold = col_name_to_unfold

        if new_col_names_col is not None and type(new_col_names_col) != str:
            raise ValueError('New-column prefix must be a string, was %s' % new_col_names_col)
        self.new_col_names_col = new_col_names_col

        if new_col_names_col is None:
            # No col specified to provide column headers for new columns:
            # String for creating names for the new columns.
            # The string is prefixed to 1,2,3,...: 'v' for 'value':
            self.new_col_prefix = 'v'

        if constant_cols is not None:
            if type(constant_cols) != list:
                raise ValueError('Parameter constant_cols must be None or a list of column names.')
            self.constant_cols = constant_cols
        else:
            # constant_cols is None:
            self.constant_cols = []

        self.out_method = out_method
        self.col_name_unfold_values = col_name_unfold_values

        # Place to accumulate the unfolded values:
        self.unfolded_values_dict = OrderedDict()
        # Place to hold the columns that are constant:
        self.const_col_dict = OrderedDict()
        # Place to hold names for new columns:
        self.new_col_names = OrderedSet()

        try:
            if type(in_path_or_2d_array) == str:
                # Get in-table from a file:
                in_fd = open(in_path_or_2d_array, 'r')
                reader = csv.reader(in_fd, delimiter=',')
            else:
                # Get in-table from a 2d array:
                reader = iter(in_path_or_2d_array)
                in_fd = None

            # Look at in-table's header line and get various
            # constants initialized:
            self.header = self.process_in_header_line(reader)

            # Read the rows and create in-memory representation
            # of transformed structure:
            for row in reader:
                # Field value of the unfold-column that is key of rows in new tbl,
                # e.g. 'DOB' or 'gender':
                unfold_col_value = row[self.col_indx_to_unfold]

                # Encountered this key (i.e. unfold-col value) before?
                # If not, init with empty array of that key's value for
                # the subject who is represented by this row.
                # We'll end up with this: {'DOB' : ['1983', '1980'], 'gender' : ['M','F']}:
                collected_values = self.unfolded_values_dict.get(unfold_col_value, [])

                # Value of this unfold-key in this row (e.g. '1983' or 'M'):
                unfold_value = row[self.col_indx_of_values]
                collected_values.append(unfold_value)
                self.unfolded_values_dict[unfold_col_value] = collected_values

                # Now take care of constant columns.
                # For each unique value of the column that is being unfolded,
                # constant columns must be unique. Example to end up with:
                #
                #    question questionType answer1 answer2
                #    --------------------------------------
                #    DOB      pullDown     1980    1983
                #    gender   radio        F       M
                #
                # Cannot have original table contain 'pullDown' for
                # some DOB row, and 'radio' for another. This won't
                # work as an original:
                #
                #    subject  question answer questionType
                #    --------------------------------------
                #    subject1 DOB      1980   pullDown
                #    subject1 gender   F      radio
                #    subject2 DOB      1983   radio
                #    subject2 gender   M      radio

                for col_num in range(len(row)):
                    try:
                        col_name = self.header[col_num]
                    except IndexError:
                        raise ValueError('Row %s has more columns than header (%s)' % (col_num, self.header))
                    col_value = row[col_num]

                    # Is this column constant for a given pivot column value?
                    if col_name in self.constant_cols:
                        # Dict:
                        #   {(<unfold-col-value>, constant_col_name) : constant_col_value}
                        # I.e. for each of the values in the column to be unfolded,
                        # each constant column has the same value, else something is wrong.
                        # Check whether we already encountered the value in the current
                        # row's unfold-value; if not, init; if yes, ensure that this
                        # constant-col's value in the current row is the same as in
                        # other rows in which the unfold-value is the same as in this row:
                        const_values_dict_key = (unfold_col_value, col_name)
                        col_constant = self.const_col_dict.get(const_values_dict_key, None)
                        if col_constant is None:
                            self.const_col_dict[const_values_dict_key] = col_value
                        else:
                            # Saw value for this column and pivot value earlier:
                            if col_value != col_constant:
                                raise ValueError("Column that is supposedly constant for a given pivot value is not: %s != %s" %
                                                 (col_value, col_constant))

                # Are we to use an existing column as source for
                # names of new columns?
                if self.new_col_names_col is not None:
                    self.new_col_names.add(row[self.new_cols_col_indx])
        finally:
            if type(in_path_or_2d_array) == str:
                in_fd.close()

        return self.output_result()

    # ---------------------------------- Private Methods ---------------------

    #-------------------------
    # create_out_header_row
    #-----------------

    def create_out_header_row(self, header):
        # Create CSV: col_name_to_unfold, constant_cols[0], constant_cols[1], ..., unfolded-values-columns
        # Find the longest row of unfolded values, so that we can pad
        # them with zeroes:
        unfolded_max_len = 0
        for unfolded_value in self.unfolded_values_dict.keys():
            num_unfolded_values = len(self.unfolded_values_dict[unfolded_value])
            unfolded_max_len = max(num_unfolded_values, unfolded_max_len)

        # Header: start with the column name that was unfolded:
        header = [self.col_name_to_unfold]
        # Continue with any columns that were constant for
        # any given unfold-value:
        header.extend(self.constant_cols)

        # Finally: invent names for all the unfolded values
        # that are now columns; or the caller specified a
        # self.new_col_names_col, and we accumulated values
        # from that column-name-providing column in self.new_col_names
        if self.new_col_names_col is not None:
            for new_col_header in self.new_col_names:
                header.append(new_col_header)
        else:
            # Invent names for the new columns: v<n>:
            for indx in range(unfolded_max_len):
                header.append('%s%s' % (self.new_col_prefix, indx))
        return (header, unfolded_max_len)

    #-------------------------
    # process_in_header_line
    #-----------------

    def process_in_header_line(self, reader):
        '''
        Given a csv- or excel reader that is pointed to a table file,
        read the first row, which is expected to be the table header.
        Error-check, and return that header.

        :param reader: object providing the file-like API
        :type reader: csv.Reader
        '''
        header = reader.next()
        # If we are to use the value of a column to name
        # new columns created for the unfolded values,
        # ensure the col exists:
        if self.new_col_names_col is not None:
            try:
                self.new_cols_col_indx = header.index(self.new_col_names_col)
            except IndexError:
                raise ValueError('Specified column %s as source of col names for unfolded columns, but no such column exists' % self.new_col_names_col)
        else:
            self.new_cols_col_indx = None

        try:
            # Does the column to be unfolded exist?
            # In the running example: 'question':
            self.col_indx_to_unfold = header.index(self.col_name_to_unfold)
        except IndexError:
            raise ValueError('The column to unfold (%s) does not appear in the table header (%s)' % (self.col_name_to_unfold, header))

        try:
            # Does the column with the unfold-values exist?
            # In the running example: 'answer':
            self.col_indx_of_values = header.index(self.col_name_unfold_values)
        except IndexError:
            raise ValueError('The column of unfold values (%s) does not appear in the table header (%s)' % (self.col_name_unfold_values, header))

        return header

    #-------------------------
    # output_result
    #-----------------

    def output_result(self):
        # Do the writing-out, to STDOUT, a file, or
        # by building an internal 2d array of the result
        # and returning an iterator to it:
        try:
            # Will be None if iterator requested:
            (out_fd, writer) = self.make_writer(self.out_method)
            (header, unfolded_max_len) = self.create_out_header_row(self.header)
            if self.out_method == OutMethod.ITERATOR:
                result = [header]
            else:
                writer.writerow(header)

            # Each new row is about one of the unfolded values,
            # like 'DOB' or 'gender' in the example:
            for unfold_key in self.unfolded_values_dict.keys():
                new_row = [unfold_key]
                # Add constant-column values if any:
                for col_name in self.constant_cols:
                    # The constant-column value for the current
                    # row's value in the column being unfolded is
                    # kept in self.const_col_dict. Keys are tuples:
                    # (unfold_col_value, constant_col_name):
                    const_col_key = (unfold_key, col_name)
                    col_constant = self.const_col_dict[const_col_key]
                    new_row.append(col_constant)

                unfolded_values = self.unfolded_values_dict[unfold_key]
                # Fill short-row vectors with zeros:
                unfolded_values = unfolded_values + (unfolded_max_len - len(unfolded_values)) * [0]
                new_row.extend(unfolded_values)
                if self.out_method == OutMethod.ITERATOR:
                    result.append(new_row)
                else:
                    writer.writerow(new_row)
        finally:
            if self.out_method == OutMethod.ITERATOR:
                return iter(result)
            elif self.out_method != OutMethod.STDOUT:
                out_fd.close()

    # ---------------------------------- Support Methods ---------------------

    #-------------------------
    # make_writer
    #-----------------

    def make_writer(self, out_method):
        # Obtain a csv writer object if function is
        # not called as a generator:
        if out_method != OutMethod.ITERATOR and out_method != OutMethod.STDOUT:
            fd = open(out_method.FILE, 'w')
        elif out_method == OutMethod.STDOUT:
            fd = sys.stdout
        else:
            fd = writer = None
        if fd is not None:
            writer = csv.writer(fd)
        return (fd, writer)
def assertions_to_sql_csv(msgpack_filename, output_dir):
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_edge_sources = output_dir + '/edge_sources.csv'
    output_node_prefixes = output_dir + '/node_prefixes.csv'
    output_features = output_dir + '/edge_features.csv'

    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()
    seen_prefixes = set()

    edge_file = open(output_edges, 'w', encoding='utf-8')
    edge_source_file = open(output_edge_sources, 'w', encoding='utf-8')
    node_prefix_file = open(output_node_prefixes, 'w', encoding='utf-8')
    feature_file = open(output_features, 'w', encoding='utf-8')

    for assertion in read_msgpack_stream(msgpack_filename):
        if assertion['uri'] in assertion_list:
            continue
        assertion_idx = assertion_list.add(assertion['uri'])
        rel_idx = relation_list.add(assertion['rel'])
        start_idx = node_list.add(assertion['start'])
        end_idx = node_list.add(assertion['end'])

        source_indices = []
        sources = assertion['sources']
        for source in sources:
            for sourceval in sorted(source.values()):
                source_idx = source_list.add(sourceval)
                source_indices.append(source_idx)

        jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
        weight = assertion['weight']
        write_row(
            edge_file,
            [assertion_idx, assertion['uri'], rel_idx, start_idx, end_idx,
             weight, jsondata]
        )
        for node in (assertion['start'], assertion['end'], assertion['dataset']):
            write_prefixes(node_prefix_file, seen_prefixes, node_list, node)
        for source_idx in sorted(set(source_indices)):
            write_row(edge_source_file, [assertion_idx, source_idx])

        if assertion['rel'] in SYMMETRIC_RELATIONS:
            features = [(0, start_idx), (0, end_idx)]
        else:
            features = [(1, start_idx), (-1, end_idx)]
        for direction, node_idx in features:
            write_row(feature_file, [rel_idx, direction, node_idx, assertion_idx])

    edge_file.close()
    edge_source_file.close()
    node_prefix_file.close()
    feature_file.close()
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
class ColumnName(object):
    """A ColumnName is a string naming the Column and optionally a set of
    qualifiers. In SQL, ColumnName qualifiers are usually table names or
    table aliases."""

    def __init__(self, name, qualifiers=None, allow_wildcard=False):
        self.original_token = name
        name_parts = name.split('.')
        self.name = name_parts[-1]
        self.is_wildcard = False
        if allow_wildcard and is_wildcard_identifier(self.name):
            self.is_wildcard = True
        elif not is_valid_identifier(self.name):
            raise InvalidColumnNameError(self.name)
        self.qualifiers = OrderedSet(qualifiers or [])
        if len(name_parts) > 1:
            self.qualifiers.add('.'.join(name_parts[:-1]).lower())

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._cased_name = value
        self._name = value.lower()

    @property
    def qualifiers(self):
        return self._qualifiers

    @qualifiers.setter
    def qualifiers(self, value):
        self._qualifiers = OrderedSet([qual.lower() for qual in value])

    def __eq__(self, other):
        if type(other) is type(self):
            return (self.name == other.name and
                    other.qualifiers == self.qualifiers)
        return False

    def __gt__(self, other):
        if type(other) is type(self):
            return ((self.name == other.name or self.is_wildcard) and
                    self.qualifiers <= other.qualifiers)
        return False

    def __lt__(self, other):
        return other > self

    def __ge__(self, other):
        return self > other or self == other

    def __le__(self, other):
        return self < other or self == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.name)

    def __str__(self):
        return self._cased_name

    def __repr__(self):
        return '<ColumnName ' + '.'.join([qualifiers_to_str(self.qualifiers), self.name]) + '>'

    def match(self, *right_column_names):
        """Given a list of ColumnNames, return a list of those that match
        this ColumnName. This operation is not commutative. That is,
        A.match(B) =/=> B.match(A)."""
        return [col for col in right_column_names if self >= col]
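# A minimal sketch of the qualifier comparison that __gt__ relies on:
# OrderedSet supports subset comparison with <=, so a column name carrying
# fewer qualifiers can match one carrying more. The qualifier strings here
# are hypothetical.
from ordered_set import OrderedSet

print(OrderedSet(['t1']) <= OrderedSet(['t1', 'alias']))  # True
print(OrderedSet(['t2']) <= OrderedSet(['t1', 'alias']))  # False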
def findSubGraphs(self):
    """Find equivalent paths starting from two equivalent nodes.

    For each node hash, take all of its basic blocks and try to build a
    path from each pair of them. The result is stored in a dual dictionary
    keyed first by the starting node hash and then by the path hash; the
    value is a list of the equivalent paths (each a list of nodes).
    """
    matchedPathsWithDifferentLengths = 0
    for i in self.M.keys():
        for z in range(len(self.M[i]) - 1):
            for j in self.M[i][z + 1:]:  # pick one from the second node onward
                visited1 = set()
                visited2 = set()
                q1 = Queue.Queue()
                # seed the queue with the first node and the candidate node
                q1.put((self.M[i][z], j))
                path1 = OrderedSet()
                path2 = OrderedSet()
                path1_bis = OrderedSet()
                path2_bis = OrderedSet()
                path1NodeHashes = {}
                path1.add(self.M[i][z])
                path2.add(j)
                path1Str = ''
                path1NodeHashes[self.M[i][z]] = self.G[(self.M[i][z])].ctx.hash_itype2
                pathHash1 = hashlib.sha1()
                while not q1.empty():
                    # for each matching pair from the queue
                    x, y = q1.get(block=False)
                    tmp_visited2 = set()
                    for l in self.G[x].succs:
                        matchedbyHash = False
                        if (l not in visited1) and (l != x) and (l not in path1):
                            visited1.add(l)
                            # take a real copy so a failed match attempt can
                            # restore the pre-attempt state
                            tmp_visited2Backup = set(tmp_visited2)
                            hashType = 'hash_itype1'
                            matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs(
                                l, y, hashType, visited2, tmp_visited2, path2)
                            if not matchedbyHash:
                                hashType = 'hash_itype2'
                                tmp_visited2 = tmp_visited2Backup
                                matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs(
                                    l, y, hashType, visited2, tmp_visited2, path2)
                            if not matchedbyHash:
                                hashType = 'freq'
                                tmp_visited2 = tmp_visited2Backup
                                matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs(
                                    l, y, hashType, visited2, tmp_visited2, path2)
                            if matchedbyHash:
                                path1NodeHashes[l] = self.G[l][hashType]
                                path1.add(l)
                                path2.add(m)
                                q1.put((l, m))
                                visited2.add(m)
                                visited2.update(tmp_visited2)
                if len(path1) != len(path2):
                    matchedPathsWithDifferentLengths += 1
                else:
                    path1_bis, path2_bis = self.makeSubgraphSingleEntryPoint(path1, path2)
                    if len(path1) > 1:
                        for kk in path1:
                            path1Str += path1NodeHashes[kk]
                        pathHash1.update(path1Str)
                        a = pathHash1.hexdigest()
                        if i not in self.pathPerNodeHashFull or a not in self.pathPerNodeHashFull[i]:
                            self.pathPerNodeHashFull[i][a] = []
                        duplicate1 = False
                        duplicate2 = False
                        listPath1 = list(path1)
                        listPath2 = list(path2)
                        for zz in self.pathPerNodeHashFull[i][a]:
                            if listPath1 == zz:
                                duplicate1 = True
                            if listPath2 == zz:
                                duplicate2 = True
                        if not duplicate1:
                            self.pathPerNodeHashFull[i][a].append(list(listPath1))
                        if not duplicate2:
                            self.pathPerNodeHashFull[i][a].append(list(listPath2))
                    if len(path1_bis) > 1:
                        path1Str = ''
                        for kk in path1_bis:
                            path1Str += path1NodeHashes[kk]
                        pathHash1.update(path1Str)
                        a = pathHash1.hexdigest()
                        if i not in self.pathPerNodeHash or a not in self.pathPerNodeHash[i]:
                            self.pathPerNodeHash[i][a] = []
                        duplicate1 = False
                        duplicate2 = False
                        listPath1 = list(path1_bis)
                        listPath2 = list(path2_bis)
                        for zz in self.pathPerNodeHash[i][a]:
                            if listPath1 == zz:
                                duplicate1 = True
                            if listPath2 == zz:
                                duplicate2 = True
                        if not duplicate1:
                            self.pathPerNodeHash[i][a].append(list(listPath1))
                        if not duplicate2:
                            self.pathPerNodeHash[i][a].append(list(listPath2))
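# Sketch of the "dual dictionary" shape the method fills in (assuming the
# containers are created as nested dicts, e.g. defaultdict(dict)):
# node hash -> path hash -> list of equivalent paths (lists of nodes).
from collections import defaultdict

pathPerNodeHash = defaultdict(dict)
node_hash, path_hash = 'deadbeef', 'cafebabe'   # hypothetical keys
pathPerNodeHash[node_hash].setdefault(path_hash, [])
pathPerNodeHash[node_hash][path_hash].append(['bb1', 'bb2', 'bb3'])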
class Rectangle(Shape):
    width = 0
    height = 0

    def __init__(self, midpoint, width, height):
        super().__init__()
        self.midpoint = midpoint
        midx, midy = midpoint
        self.width = width
        self.height = height
        ul_x = -1 * int(self.width / 2) + midx
        ul_y = -1 * int(self.height / 2) + midy
        self.ul = (ul_x, ul_y)

    def find_points(self):
        startx, starty = self.ul
        self._points = OrderedSet()
        for x in range(int(startx), int(startx) + self.width):
            for y in range(int(starty), int(starty) + self.height):
                self._points.add((int(x), int(y)))

    def area(self):
        return self.width * self.height

    def grow(self, direction):
        if direction == Direction.north:
            old_x, old_y = self.ul
            self.ul = (old_x, old_y - 1)
            new_x, new_y = self.ul
            self.height += 1
            new_midy = new_y + int(self.height / 2)
            self.midpoint = (self.midpoint[0], new_midy)
        elif direction == Direction.west:
            old_x, old_y = self.ul
            self.ul = (old_x - 1, old_y)
            new_x, new_y = self.ul
            self.width += 1
            new_midx = new_x + int(self.width / 2)
            self.midpoint = (new_midx, self.midpoint[1])
        elif direction == Direction.south:
            self.height += 1
            x, y = self.ul
            new_midy = y + int(self.height / 2)
            self.midpoint = (self.midpoint[0], new_midy)
        elif direction == Direction.east:
            self.width += 1
            x, y = self.ul
            new_midx = x + int(self.width / 2)
            self.midpoint = (new_midx, self.midpoint[1])
        self.dirty = True

    def move(self, direction):
        dx, dy = direction
        ul_x, ul_y = self.ul
        mx, my = self.midpoint
        self.ul = ul_x + dx, ul_y + dy
        self.midpoint = mx + dx, my + dy
        self.dirty = True

    def edge(self, direction):
        """Points one step outside the rectangle along the <direction> edge."""
        points = set()
        if direction == Direction.north:
            ul_x, y = self.ul
            for x in range(ul_x, ul_x + self.width):
                points.add((x, y - 1))
        elif direction == Direction.south:
            ul_x, ul_y = self.ul
            y = ul_y + self.height - 1
            for x in range(ul_x, ul_x + self.width):
                points.add((x, y + 1))
        elif direction == Direction.east:
            ul_x, ul_y = self.ul
            x = ul_x + self.width - 1
            for y in range(ul_y, ul_y + self.height):
                points.add((x + 1, y))
        elif direction == Direction.west:
            x, ul_y = self.ul
            for y in range(ul_y, ul_y + self.height):
                points.add((x - 1, y))
        return points
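# Hypothetical usage (assuming the surrounding Shape/Direction definitions):
# edge() yields points just outside the rectangle, which is why the returned
# coordinates step one unit past the border.
rect = Rectangle(midpoint=(0, 0), width=2, height=2)   # ul == (-1, -1)
assert rect.edge(Direction.north) == {(-1, -2), (0, -2)}
rect.grow(Direction.east)                              # width 2 -> 3
assert rect.width == 3 and rect.midpoint == (0, 0)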
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # must be reset for every line, not only for symmetric relations
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
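# Illustration of the feature-string convention above, with a made-up edge
# (cat IsA animal): '~' marks a symmetric feature, while '-' marks the slot
# the paired concept would fill in a directed one.
relation = '/r/IsA'
concept1, concept2 = '/c/en/cat', '/c/en/animal'

print('{} {} -'.format(concept1, relation))          # '/c/en/cat /r/IsA -'
print('- {} {}'.format(concept2, relation))          # '- /c/en/animal /r/IsA'
print('{} {} ~'.format(concept1, '/r/SimilarTo'))    # symmetric form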
class Featurizer(object):
    #
    # Converts chorales into a matrix of feature indices. Each vector in a matrix represents a specific beat within
    # a chorale. Note that indices are 1-based to comply with Torch.
    #

    # Initialize split percentages, directories, and the feature sets
    def __init__(self):
        self.percentage_train = 0.8  # percentage of scores to be in the training split
        self.percentage_dev = 0.5  # percentage of the held-out set to be used as a dev set
        self.data_dir = "raw_data/"
        self.output_dir = "data/"

        # Features
        self.keys = OrderedSet()
        self.modes = OrderedSet()
        self.times = OrderedSet()
        self.beats = OrderedSet()
        self.offsets = OrderedSet()
        self.cadence_dists = OrderedSet()
        self.cadences = OrderedSet()
        self.pitches = OrderedSet()
        self.intervals = OrderedSet()
        self.roots = OrderedSet()
        self.inversions = OrderedSet()
        self.bases = OrderedSet()
        self.altos = OrderedSet()
        self.tenors = OrderedSet()

        # THIS ORDER MATTERS (self.intervals appears twice: once for the
        # interval to the previous note, once for the interval to the next)
        self.input_features = [self.keys, self.modes, self.times, self.beats, self.offsets, self.cadence_dists, \
                               self.cadences, self.pitches, self.intervals, self.intervals, self.roots, \
                               self.bases, self.inversions]
        self.output_features = [self.roots, self.bases, self.inversions, self.altos, self.tenors]

    # Collect all preprocessed scores
    @timing
    def gather_scores(self):
        self.original = []
        for f in glob(self.data_dir + "*.xml"):
            score = converter.parse(f)
            if score.parts[0].quarterLength > 300:  # skip an excessively long score
                continue
            self.original.append(score)
        print "Gathered %d 4-part chorales." % len(self.original)
        return self.original

    # Create X and y matrices of features for each chorale
    @timing
    def analyze(self):
        print "Analyzing..."
        self.analyzed = []  # to save time, we store the related objects to a score for featurizing
        Xvalues, yvalues = [], []

        # Create X and y matrices for each chorale
        for idx, score in enumerate(self.original):
            sys.stdout.write("Analyzing #%d \r" % (idx + 1))
            sys.stdout.flush()

            # score-wide features
            S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), getNotes(score.parts[2]), getNotes(score.parts[3])
            assert len(S) == len(A)
            assert len(A) == len(T)
            assert len(T) == len(B)
            time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
            key_obj = getKeyFromSignature(key_sig)
            tonic = key_obj.tonic.midi
            fermata_locations = map(hasFermata, S)

            # Input/target data for each chorale
            Xc, yc = [], []

            # Create X vector and y output
            for index, n in enumerate(S):
                # [0]: Key
                v_key = key_sig.sharps
                self.keys.add(v_key)
                # [1]: Mode
                v_mode = key_sig.mode
                self.modes.add(v_mode)
                # [2]: Time
                v_time = (time_sig.numerator, time_sig.denominator)
                self.times.add(v_time)
                # [3]: Beat strength
                v_beat = n.beatStrength
                self.beats.add(n.beatStrength)
                # [4]: Offset end
                v_off_end = int(math.floor((len(S) - index) / 4.))
                self.offsets.add(v_off_end)
                # [5]: Cadence distance
                v_cadence_dist = 0 if hasFermata(n) else fermata_locations[index:].index(True)
                self.cadence_dists.add(v_cadence_dist)
                # [6]: Is a point of cadence
                v_cadence = 1 if hasFermata(n) else 0
                self.cadences.add(v_cadence)
                # [7]: Soprano pitch (relative to key signature)
                v_pitch = (n.midi - tonic) % 12
                self.pitches.add(v_pitch)
                # [8]: Interval to previous melody note
                v_ibefore = S[index].midi - S[index - 1].midi if index > 0 else 'None'
                self.intervals.add(v_ibefore)
                # [9]: Interval to next melody note
                v_iafter = S[index + 1].midi - S[index].midi if index < len(S) - 1 else 'None'
                self.intervals.add(v_iafter)
                # [10]: root at time t-1
                # [11]: base at time t-1
                # [12]: inversion at time t-1
                timetminus1 = yc[-1] if len(yc) > 0 else ('*padding*', '*padding*', '*padding*')
                v_root_prev = timetminus1[0]  # NOTE THE ORDER
                v_base_prev = timetminus1[1]
                v_inv_prev = timetminus1[2]

                # Output vector
                # [0]: root
                # [1]: base
                # [2]: inversion
                consonance = [1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0]  # see gct module
                v_root, v_inv, v_base = gct.GCT(tonic, consonance, chord.Chord([B[index], T[index], A[index], S[index]]))
                self.roots.add(v_root)
                self.bases.add(v_base)
                self.inversions.add(v_inv)
                # [3]: Alto pitch (relative to key signature)
                v_alto = (A[index].midi - tonic) % 12
                self.altos.add(v_alto)
                # [4]: Tenor pitch (relative to key signature)
                v_tenor = (T[index].midi - tonic) % 12
                self.tenors.add(v_tenor)

                # Input vector
                input_vec = [v_key, v_mode, v_time, v_beat, v_off_end, v_cadence_dist, v_cadence, \
                             v_pitch, v_ibefore, v_iafter, v_root_prev, v_base_prev, v_inv_prev]
                output_vec = [v_root, v_base, v_inv, v_alto, v_tenor]
                Xc.append(input_vec)
                yc.append(output_vec)

            self.analyzed.append((Xc, yc, score, idx))
            Xvalues.append(Xc)
            yvalues.append(yc)

        # Add the '*padding*' option to every feature space
        for feature_space in self.input_features + self.output_features:
            feature_space.add('*padding*')

        freezeObject(Xvalues, 'Xvalues')
        freezeObject(yvalues, 'yvalues')
        freezeObject(self.roots, 'roots')
        freezeObject(self.bases, 'bases')
        freezeObject(self.inversions, 'inversions')
        freezeObject(self.altos, "alto_range")
        freezeObject(self.tenors, "tenor_range")
        freezeObject(self.input_features, "input_features")
        freezeObject(self.output_features, "output_features")

    # After calling self.analyze, this converts the X and y matrices to vectors of feature indices
    # As scores are examined, the indices of output chords are generated.
    @timing
    def featurize(self):
        print "Featurizing..."
        self.featurized = []

        # Set the indices
        self.input_indices = []
        max_index = 1
        for feature_space in self.input_features:
            self.input_indices.append((max_index, max_index + len(feature_space) - 1))
            max_index += len(feature_space)

        for Xc, yc, score, idx in self.analyzed:
            Xcf, ycf = [], []
            for vec in Xc:
                fvec = []
                for fidx, feature_space in enumerate(self.input_features):
                    f_feature = feature_space.index(vec[fidx])
                    fvec.append(f_feature + self.input_indices[fidx][0])
                Xcf.append(fvec)
            for vec in yc:
                fvec = []
                for fidx, feature_space in enumerate(self.output_features):
                    fvec.append(feature_space.index(vec[fidx]) + 1)
                ycf.append(fvec)
            self.featurized.append((npy.matrix(Xcf), npy.matrix(ycf), score, idx))

    # Verify that the feature indices are all in the right ranges
    @timing
    def verify(self):
        print "Verifying indices..."
        for Xcf, ycf, score, idx in self.featurized:
            for fidx in range(Xcf.shape[1]):
                assert Xcf[:, fidx].min() >= self.input_indices[fidx][0]
                assert Xcf[:, fidx].max() <= self.input_indices[fidx][1]
                if fidx > 0:
                    assert Xcf[:, fidx - 1].max() < Xcf[:, fidx].min()
            for fidx in range(ycf.shape[1]):
                assert ycf[:, fidx].min() >= 1
                assert ycf[:, fidx].max() <= len(self.output_features[fidx])

    # Split the chorales into training, dev, and test groups
    @timing
    def train_test_split(self):
        self.train, remaining = split(self.featurized, self.percentage_train)
        self.dev, self.test = split(remaining, self.percentage_dev)
        print "Training, dev, and test sets contain %d, %d, %d chorales, respectively." \
            % (len(self.train), len(self.dev), len(self.test))

    # Create the aggregate datasets
    @timing
    def aggregate(self):
        stack = lambda x1, x2: npy.vstack((x1, x2))
        self.trainX, self.trainy = [x for (x, y, sc, idx) in self.train], [y for (x, y, sc, idx) in self.train]
        self.devX, self.devy = [x for (x, y, sc, idx) in self.dev], [y for (x, y, sc, idx) in self.dev]
        self.testX, self.testy = [x for (x, y, sc, idx) in self.test], [y for (x, y, sc, idx) in self.test]
        self.trainXall, self.trainyall = reduce(stack, self.trainX), reduce(stack, self.trainy)
        self.devXall, self.devyall = reduce(stack, self.devX), reduce(stack, self.devy)
        self.testXall, self.testyall = reduce(stack, self.testX), reduce(stack, self.testy)
        self.dataXall = stack(stack(self.trainXall, self.devXall), self.testXall)
        self.datayall = stack(stack(self.trainyall, self.devyall), self.testyall)

    # Write
    @timing
    def write(self):
        print "Writing to %s folder." % self.output_dir
        with h5py.File(self.output_dir + "chorales.hdf5", "w", libver="latest") as f:
            # Write accumulated chorales
            f.create_dataset("Xtrain", self.trainXall.shape, dtype="i", data=self.trainXall)
            f.create_dataset("ytrain", self.trainyall.shape, dtype="i", data=self.trainyall)
            f.create_dataset("Xdev", self.devXall.shape, dtype="i", data=self.devXall)
            f.create_dataset("ydev", self.devyall.shape, dtype="i", data=self.devyall)
            f.create_dataset("Xtest", self.testXall.shape, dtype="i", data=self.testXall)
            f.create_dataset("ytest", self.testyall.shape, dtype="i", data=self.testyall)

            # Write every chorale into train/dev/test sets
            with open('data/chorale_index.txt', 'w') as m:
                m.write("TRAINING SET\n")
                for idx, (X, y) in enumerate(zip(self.trainX, self.trainy)):
                    f.create_dataset("train/chorale%d_X" % idx, X.shape, dtype="i", data=X)
                    f.create_dataset("train/chorale%d_y" % idx, y.shape, dtype="i", data=y)
                    m.write("%d\t %s\n" % (idx, self.train[idx][2].metadata.title))
                m.write("VALIDATION SET\n")
                for idx, (X, y) in enumerate(zip(self.devX, self.devy)):
                    f.create_dataset("dev/chorale%d_X" % idx, X.shape, dtype="i", data=X)
                    f.create_dataset("dev/chorale%d_y" % idx, y.shape, dtype="i", data=y)
                    m.write("%d\t %s\n" % (idx, self.dev[idx][2].metadata.title))
                m.write("TEST SET\n")
                for idx, (X, y) in enumerate(zip(self.testX, self.testy)):
                    f.create_dataset("test/chorale%d_X" % idx, X.shape, dtype="i", data=X)
                    f.create_dataset("test/chorale%d_y" % idx, y.shape, dtype="i", data=y)
                    m.write("%d\t %s\n" % (idx, self.test[idx][2].metadata.title))

            # Write every chorale individually
            for Xcf, ycf, score, idx in self.featurized:
                f.create_dataset("chorale%d_X" % idx, Xcf.shape, dtype="i", data=Xcf)
                f.create_dataset("chorale%d_y" % idx, ycf.shape, dtype="i", data=ycf)

        # Save test scores for future use
        test_scores = [sc for (x, y, sc, idx) in self.test]
        test_dir = '/Users/hzabriskie/Documents/Thesis/thesis/bach_code/data/test_scores'
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        for idx, sc in enumerate(test_scores):
            sc.write('musicxml', test_dir + '/' + str(idx) + '.xml')

    def run(self):
        self.gather_scores()
        self.analyze()
        self.featurize()
        self.verify()
        self.train_test_split()
        self.aggregate()
        self.write()
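# A minimal sketch (with made-up feature spaces) of the 1-based index-offset
# scheme computed in featurize(): each feature space owns a contiguous block
# of global indices, so every feature value maps to one Torch lookup index.
keys = ['A', 'B']           # global indices 1..2
modes = ['major', 'minor']  # global indices 3..4

input_features = [keys, modes]
input_indices = []
max_index = 1
for feature_space in input_features:
    input_indices.append((max_index, max_index + len(feature_space) - 1))
    max_index += len(feature_space)

assert input_indices == [(1, 2), (3, 4)]
# 'minor' featurizes to modes.index('minor') + input_indices[1][0] == 4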
class Featurizer(object):
    #
    # Converts chorales into a matrix of feature indices. Each vector in a matrix represents a specific beat within
    # a chorale. Note that indices are 1-based to comply with Torch.
    #

    # Initialize split percentages, directories, and the feature sets
    def __init__(self):
        self.percentage_train = 0.8  # percentage of scores to be in the training split
        self.percentage_dev = 0.5  # percentage of the held-out set to be used as a dev set
        self.data_dir = "raw_data/"
        self.output_dir = "data/"

        # Features
        self.keys = OrderedSet()
        self.modes = OrderedSet()
        self.times = OrderedSet()
        self.beats = OrderedSet()
        self.offsets = OrderedSet()
        self.cadence_dists = OrderedSet()
        self.cadences = OrderedSet()
        self.pitches = OrderedSet()
        self.intervals = OrderedSet()
        self.numerals = OrderedSet()
        self.inversions = OrderedSet()
        self.altos = OrderedSet()
        self.tenors = OrderedSet()

        # THIS ORDER MATTERS (self.intervals appears twice: once for the
        # interval to the previous note, once for the interval to the next)
        self.input_features = [self.keys, self.modes, self.times, self.beats, self.offsets, self.cadence_dists, \
                               self.cadences, self.pitches, self.intervals, self.intervals, self.numerals, self.inversions]
        self.output_features = [self.numerals, self.inversions, self.altos, self.tenors]

    # Collect all preprocessed scores
    @timing
    def gather_scores(self):
        self.original = []
        for f in glob(self.data_dir + "*.xml"):
            self.original.append(converter.parse(f))
        print "Gathered %d 4-part chorales." % len(self.original)
        return self.original

    # Create X and y matrices of features for each chorale
    @timing
    def analyze(self):
        print "Analyzing..."
        self.analyzed = []  # to save time, we store the related objects to a score for featurizing
        Xvalues, yvalues = [], []

        # Create X and y matrices for each chorale
        for idx, score in enumerate(self.original):
            sys.stdout.write("Analyzing #%d \r" % (idx + 1))
            sys.stdout.flush()

            # score-wide features
            S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), getNotes(score.parts[2]), getNotes(score.parts[3])
            assert len(S) == len(A)
            assert len(A) == len(T)
            assert len(T) == len(B)
            time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
            key_obj = getKeyFromSignature(key_sig)
            fermata_locations = map(hasFermata, S)

            # Input/target data for each chorale
            Xc, yc = [], []

            # Create X vector and y output
            for index, n in enumerate(S):
                # [0]: Key
                v_key = key_sig.sharps
                self.keys.add(v_key)
                # [1]: Mode
                v_mode = key_sig.mode
                self.modes.add(v_mode)
                # [2]: Time
                v_time = (time_sig.numerator, time_sig.denominator)
                self.times.add(v_time)
                # [3]: Beat strength
                v_beat = n.beatStrength
                self.beats.add(n.beatStrength)
                # [4]: Offset end
                v_off_end = int(math.floor((len(S) - index) / 4.))
                self.offsets.add(v_off_end)
                # [5]: Cadence distance
                v_cadence_dist = 0 if hasFermata(n) else fermata_locations[index:].index(True)
                self.cadence_dists.add(v_cadence_dist)
                # [6]: Is a point of cadence
                v_cadence = 1 if hasFermata(n) else 0
                self.cadences.add(v_cadence)
                # [7]: Pitch
                v_pitch = n.midi
                self.pitches.add(v_pitch)
                # [8]: Interval to previous melody note
                v_ibefore = S[index].midi - S[index - 1].midi if index > 0 else 'None'
                self.intervals.add(v_ibefore)
                # [9]: Interval to next melody note
                v_iafter = S[index + 1].midi - S[index].midi if index < len(S) - 1 else 'None'
                self.intervals.add(v_iafter)
                # [10]: Numeral at time t-1
                # [11]: Inversion at time t-1
                timetminus1 = yc[-1] if len(yc) > 0 else ('None', 'None')
                v_num_prev = timetminus1[0]  # Roman numeral
                v_inv_prev = timetminus1[1]  # inversion
                # Intentionally not added to self.numerals and self.inversions

                # Output vector
                # [0]: numeral
                # [1]: inversion
                v_num, v_inv = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
                self.numerals.add(v_num)
                self.inversions.add(v_inv)
                # [2]: alto pitch
                v_alto = A[index].midi
                self.altos.add(v_alto)
                # [3]: tenor pitch
                v_tenor = T[index].midi
                self.tenors.add(v_tenor)

                # Input vector
                input_vec = [v_key, v_mode, v_time, v_beat, v_off_end, v_cadence_dist, v_cadence, \
                             v_pitch, v_ibefore, v_iafter, v_num_prev, v_inv_prev]
                output_vec = [v_num, v_inv, v_alto, v_tenor]
                Xc.append(input_vec)
                yc.append(output_vec)

            self.analyzed.append((Xc, yc, score, idx))
            Xvalues.append(Xc)
            yvalues.append(yc)

        # Add the 'None' option
        self.numerals.add('None')
        self.inversions.add('None')
        self.intervals.add('None')

        freezeObject(Xvalues, 'Xvalues')
        freezeObject(yvalues, 'yvalues')

    # After calling self.analyze, this converts the X and y matrices to vectors of feature indices
    # As scores are examined, the indices of output chords are generated.
    @timing
    def featurize(self):
        print "Featurizing..."
        self.featurized = []

        # Set the indices
        self.input_indices = []
        max_index = 1
        for feature_space in self.input_features:
            self.input_indices.append((max_index, max_index + len(feature_space) - 1))
            max_index += len(feature_space)

        for Xc, yc, score, idx in self.analyzed:
            Xcf, ycf = [], []
            for vec in Xc:
                fvec = []
                for fidx, feature_space in enumerate(self.input_features):
                    f_feature = feature_space.index(vec[fidx])
                    fvec.append(f_feature + self.input_indices[fidx][0])
                Xcf.append(fvec)
            for vec in yc:
                fvec = []
                for fidx, feature_space in enumerate(self.output_features):
                    fvec.append(feature_space.index(vec[fidx]) + 1)
                ycf.append(fvec)
            self.featurized.append((npy.matrix(Xcf), npy.matrix(ycf), score, idx))

    # Verify that the feature indices are all in the right ranges
    @timing
    def verify(self):
        print "Verifying indices..."
        for Xcf, ycf, score, idx in self.featurized:
            for fidx in range(Xcf.shape[1]):
                assert Xcf[:, fidx].min() >= self.input_indices[fidx][0]
                assert Xcf[:, fidx].max() <= self.input_indices[fidx][1]
                if fidx > 0:
                    assert Xcf[:, fidx - 1].max() < Xcf[:, fidx].min()
            for fidx in range(ycf.shape[1]):
                assert ycf[:, fidx].min() >= 1
                assert ycf[:, fidx].max() <= len(self.output_features[fidx])

    # Split the chorales into training, dev, and test groups
    @timing
    def train_test_split(self):
        self.train, remaining = split(self.featurized, self.percentage_train)
        self.dev, self.test = split(remaining, self.percentage_dev)
        print "Training, dev, and test sets contain %d, %d, %d chorales, respectively." \
            % (len(self.train), len(self.dev), len(self.test))

    # Create the aggregate datasets
    @timing
    def aggregate(self):
        stack = lambda x1, x2: npy.vstack((x1, x2))
        trainX, trainy = [x for (x, y, sc, idx) in self.train], [y for (x, y, sc, idx) in self.train]
        devX, devy = [x for (x, y, sc, idx) in self.dev], [y for (x, y, sc, idx) in self.dev]
        testX, testy = [x for (x, y, sc, idx) in self.test], [y for (x, y, sc, idx) in self.test]
        self.trainXall, self.trainyall = reduce(stack, trainX), reduce(stack, trainy)
        self.devXall, self.devyall = reduce(stack, devX), reduce(stack, devy)
        self.testXall, self.testyall = reduce(stack, testX), reduce(stack, testy)
        self.dataXall = stack(stack(self.trainXall, self.devXall), self.testXall)
        self.datayall = stack(stack(self.trainyall, self.devyall), self.testyall)

    # Write
    @timing
    def write(self):
        print "Writing to %s folder." % self.output_dir
        with h5py.File(self.output_dir + "chorales.hdf5", "w", libver="latest") as f:
            f.create_dataset("Xtrain", self.trainXall.shape, dtype="i", data=self.trainXall)
            f.create_dataset("ytrain", self.trainyall.shape, dtype="i", data=self.trainyall)
            f.create_dataset("Xdev", self.devXall.shape, dtype="i", data=self.devXall)
            f.create_dataset("ydev", self.devyall.shape, dtype="i", data=self.devyall)
            f.create_dataset("Xtest", self.testXall.shape, dtype="i", data=self.testXall)
            f.create_dataset("ytest", self.testyall.shape, dtype="i", data=self.testyall)
            with open('data/chorale_index.txt', 'w') as m:
                for Xcf, ycf, score, idx in self.featurized:
                    f.create_dataset("chorale%d_X" % idx, Xcf.shape, dtype="i", data=Xcf)
                    f.create_dataset("chorale%d_y" % idx, ycf.shape, dtype="i", data=ycf)
                    m.write("%d\t %s\n" % (idx, score.metadata.title))

        freezeObject(self.input_features, "input_features")
        freezeObject(self.input_indices, "input_indices")
        freezeObject(self.numerals, "numerals")
        freezeObject(self.inversions, "inversions")
        freezeObject(self.altos, "alto_range")
        freezeObject(self.tenors, "tenor_range")

    def run(self):
        self.gather_scores()
        self.analyze()
        self.featurize()
        self.verify()
        self.train_test_split()
        self.aggregate()
        self.write()
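# Hypothetical read-back check of the HDF5 file written above (dataset names
# taken from write(); the path assumes the default output_dir):
import h5py

with h5py.File("data/chorales.hdf5", "r") as f:
    Xtrain = f["Xtrain"][()]
    ytrain = f["ytrain"][()]
    assert Xtrain.shape[0] == ytrain.shape[0]   # one output row per beat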
def test_tuples(): set1 = OrderedSet() tup = ('tuple', 1) set1.add(tup) eq_(set1.index(tup), 0) eq_(set1[0], tup)