Example #1
def crawl_from_base(base_link, num_of_links):
    links = OrderedSet()
    link_index = 0

    while len(links) < num_of_links:
        # Get website data and setup parser
        html_data = requests.get(base_link)
        html_parser = BeautifulSoup(html_data.text, 'html.parser')

        # Get <a> tags
        a_tags = html_parser.find_all('a')

        # Get href links that start with "http"
        for a_tag in a_tags:
            if len(links) >= num_of_links:
                return links
            elif a_tag.get('href', '') != '' and re.search(
                    "^http", a_tag['href']):
                links.add(a_tag['href'])

        # Update index to check next page
        link_index += 1
        base_link = links[link_index]

    return links
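The crawler above depends on two OrderedSet behaviours at once: duplicate links are dropped, and links[link_index] works because the set also supports positional indexing. A minimal sketch of both, assuming the ordered_set PyPI package (the snippets here do not name the implementation):

from ordered_set import OrderedSet

links = OrderedSet()
for url in ('http://a.example', 'http://b.example', 'http://a.example'):
    links.add(url)
print(len(links))   # 2 -- the duplicate is ignored
print(links[1])     # 'http://b.example' -- positional access, like a list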
Example #2
def loadCache():
    tracksSet = OrderedSet()
    with open("library_cache.json", "r") as f:
        tracks = json.load(f)
        for trackDict in tracks["tracks"]:
            tracksSet.add(Track.fromDict(trackDict))
    return tracksSet
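For the deduplication in loadCache() to be meaningful, Track must compare and hash by value. Track and library_cache.json are not shown here, so the following is only an illustrative sketch of the kind of model that would behave correctly inside an OrderedSet:

from dataclasses import dataclass

@dataclass(frozen=True)   # frozen dataclass generates value-based __eq__ and __hash__
class Track:
    artist: str
    title: str
    album: str = ''

    @classmethod
    def fromDict(cls, d):
        # hypothetical field names; the real cache format is not shown in the snippet
        return cls(d.get('artist', ''), d.get('title', ''), d.get('album', ''))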
Example #3
def find_similar(topic, title, tags):

    print('DEBUG IN FIND SIMILAR: {}, {}, {}\n'.format(topic, title, tags))

    if topic not in valid_sites:
        raise Exception('Unsupported topic')

    method = 'search/advanced'
    SITE = StackAPI(topic, key=APP_KEY, access_token=ACCESS_TOKEN)

    similar = []
    similar += SITE.fetch(
        method, q=title, tags=';'.join(tags), answers=1,
        sort='votes')['items']  # title match and 1+ tags match
    similar += SITE.fetch(method,
                          q=title,
                          answers=1,
                          store_new_question='votes')['items']  # title match
    #similar += SITE.fetch(method, tags=';'.join(tags), answers=1, sort='votes')['items'] # 1+ tags match

    ids = OrderedSet()
    for s in similar:
        ids.add(str(s['question_id']))
    ids = list(ids)[:15]  # Top 15

    print('{} SIMILAR FOUND\n'.format(len(ids)))

    return get_questions_and_answers(topic, ids)
Example #4
class Circle(Shape):
    radius = 0

    def __init__(self, midpoint, radius):
        super().__init__()

        self.midpoint = midpoint
        self.radius = int(radius)

    @classmethod
    def from_rect(cls, rect):
        diameter = min(rect.width, rect.height)
        radius = int(diameter/2)
        midpoint = rect.midpoint

        return Circle(midpoint, radius)

    def find_points(self):
        midx, midy = self.midpoint
        self._points = OrderedSet()

        for x in range(-1*self.radius, self.radius+1):
            for y in range(-1*self.radius, self.radius+1):
                if self.contains_point((int(x), int(y))):
                    self._points.add((int(x+midx), int(y+midy)))

    def contains_point(self, p):
        x, y = p
        return (x+0.5)**2 + (y+0.5)**2 <= self.radius**2
Example #5
def extract_phrases_without_new_chars():
    chars = get_chars('../output/TRADITIONAL_CHARS.TXT')
    phrases = extract_phrases('../ex-src/cj5-ftzk_utf-8.txt')
    phrases_without_new_chars = OrderedSet()
    phrases_with_new_chars = OrderedSet()
    for phrase in phrases:
        has_new = False
        for char in phrase:
            if char not in chars:
                has_new = True
                break
        if has_new:
            phrases_with_new_chars.add(phrase)
        else:
            phrases_without_new_chars.add(phrase)
    print('total phrases = {}'.format(len(phrases)))
    print('phrases without new chars = {}'.format(len(phrases_without_new_chars)))
    print('phrases with new chars = {}'.format(len(phrases_with_new_chars)))
    f = open('../output/TRADITIONAL_PHRASES.TXT', 'w')
    for p in phrases_without_new_chars:
        f.write('%s\n' % p)
    f.close()
    print('Phrases with new chars:')
    for p in phrases_with_new_chars:
        print(p)
Example #6
def collect_all_phrases(path):
    chars_no_data = OrderedSet()
    char_phrases = []
    f = open(path)
    lines = f.readlines()
    f.close()
    for line in lines:
        char = line.strip()
        # no data file
        if not os.path.exists('../output/char_data/'+char+'.html'):
            chars_no_data.add(char)
        else: 
            df = open('../output/char_data/'+char+'.html')
            content = df.read()
            df.close()
            if 'html' not in content:
                chars_no_data.add(char)
            else:
                phrases = collect_phrases(content)
                char_phrases.append(json.dumps({'char':char, 'phrases': phrases}, ensure_ascii=False))

    # write chars with phrases
    print('total chars with phrases: {}'.format(len(char_phrases)))
    fo = open('../output/ONLINE_CHAR_PHRASES.TXT', 'a')
    for cp in char_phrases:
        fo.write("%s\n" % cp)
    fo.close()

    # save remaining
    print('characters without data: {}'.format(len(chars_no_data)))
    for char in chars_no_data:
        print(char)
Example #7
class IncludeRequest(Request):
    """ Adds the ability to include webassets bundles on the request.

    If the bundle does not exist, a KeyError will be raised during the
    rendering of the response, after the view has returned.

    Including a bundle multiple times will have the same result as
    including it once.

    The bundles are rendered in the order in which they were included. Bundles
    that are included first, are also rendered first.

    For example:

        @App.html(model=Model)
        def view(self, request):
            request.include('jquery')  # includes the jquery bundle

    """
    def __init__(self, *args, **kwargs):
        super(IncludeRequest, self).__init__(*args, **kwargs)
        self.included_assets = OrderedSet()

    def include(self, resource):
        self.included_assets.add(resource)
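The docstring's guarantee that including a bundle several times has the same effect as including it once falls straight out of OrderedSet semantics; a minimal sketch of what included_assets ends up holding, assuming the ordered_set package:

from ordered_set import OrderedSet

included_assets = OrderedSet()
for bundle in ('jquery', 'common', 'jquery'):
    included_assets.add(bundle)
print(list(included_assets))   # ['jquery', 'common'] -- deduplicated, insertion order kept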
Example #8
def remove_stopwords(text):
    clean_text = OrderedSet()
    for i in text:
        i = re.sub(r'\W', '', i)
        if i not in stop_words:
            clean_text.add(i)
    return clean_text
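A small illustration of the same idea outside the helper (stop_words is undefined in the snippet, so a throwaway list is used here): the surviving tokens keep their original order while repeats are dropped. Assuming the ordered_set package:

from ordered_set import OrderedSet

stop_words = {'the', 'a', 'is'}                            # hypothetical stop-word list
tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'cat']
clean_text = OrderedSet(t for t in tokens if t not in stop_words)
print(list(clean_text))   # ['cat', 'sat', 'on', 'mat'] -- order kept, repeats dropped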
Example #9
def evidence_writer(filtered_evidence, sentence_id, data_source, resource_v,
                    top_k, predicate, set_up, rule_predicates):
    data_source = data_source + '/' + set_up
    # rule_predicates = get_rule_predicates(data_source, top_k, predicate)
    # print rule_predicates
    item_set = OrderedSet()
    print resource_v, predicate
    for evidence in filtered_evidence:
        if evidence[1] in rule_predicates:
            if evidence[0] == resource_v[0] and evidence[2] == resource_v[
                    1] and evidence[1] == predicate:
                pass
            else:
                try:
                    item_set.add(evidence[1] + '("' + evidence[0] + '","' +
                                 evidence[2] + '").')
                except:
                    pass
    with open(
            'LPmln/' + data_source + '/evidence_' + top_k + '/' + sentence_id +
            predicate + '.txt', 'wb') as csvfile:
        for i in item_set:
            if '*' not in i:
                try:
                    csvfile.write(i + '\n')
                except:
                    pass

    with open('LPmln/' + data_source + '/evidence_'+top_k+'/' + sentence_id + predicate + '.txt', 'r') as f, \
            open('LPmln/' + data_source + '/evidence_'+top_k+'/' + sentence_id + predicate + '_unique.txt', 'wb') as\
                    out_file:
        out_file.writelines(unique_everseen(f))
    remove_file = 'LPmln/' + data_source + '/evidence_' + top_k + '/' + sentence_id + predicate + '.txt'
    os.remove(remove_file)
    return item_set
Example #10
 def build_clr_states(self):
     self.canonical_collection = []
     start = OrderedSet(
         [Item(self.grammar.rules[0], 0, set([DomainTag.END_OF_TEXT]))])
     self.canonical_collection.append(State(self.grammar, start))
     i = 0
     while i < len(self.canonical_collection):
         swd = OrderedSet()
         for item in self.canonical_collection[i].items:
             if item.get_current() != None:
                 swd.add(item.get_current())
         for s in swd:
             next_state_items = OrderedSet()
             for item in self.canonical_collection[i].items:
                 if item.get_current() != None and item.get_current() == s:
                     temp = Item(item.rule, item.marker + 1, item.lookahead)
                     next_state_items.add(temp)
             next_state = State(self.grammar, next_state_items)
             exists = False
             for j in range(len(self.canonical_collection)):
                 if self.canonical_collection[j].items == next_state.items:
                     exists = True
                     self.canonical_collection[i].transition[
                         s] = self.canonical_collection[j]
             if not exists:
                 self.canonical_collection.append(next_state)
                 self.canonical_collection[i].transition[s] = next_state
         i += 1
Example #11
def evidence_writer(evidences, sentence_id, data_source, resource_v, rule_predicates):
    item_set = OrderedSet()
    for evidence in evidences:
        if evidence[1] in rule_predicates:
            if evidence[0] == resource_v[0] and evidence[2] == resource_v[1] and evidence[1] == data_source:
                pass
            else:
                try:
                    if '"' not in evidence[0] and '"' not in evidence[2]:
                        if ':' not in evidence[0] and ':' not in evidence[2]:
                            if '#' not in evidence[0] and '#' not in evidence[2]:
                                item_set.add(evidence[1] + '("' + evidence[0] + '","' + evidence[2] + '").')
                except:
                    pass
    with open('dataset/' + data_source + '/evidence/'+dbpedia + '/' + rule_mining + '/' + str(sentence_id)+'_.txt', 'wb') as csvfile:
        for i in item_set:
            if '*' not in i:
                try:
                    print i
                    csvfile.write(i+'\n')
                except:
                    pass

    with open('dataset/' + data_source + '/evidence/'+dbpedia + '/'+ rule_mining + '/' + str(sentence_id)+'_.txt', 'r') as f, \
            open('dataset/' + data_source + '/evidence/'+ dbpedia + '/' + rule_mining + '/' + str(sentence_id) + '_unique.txt', 'wb') as out_file:
        out_file.writelines(unique_everseen(f))
    remove_file = 'dataset/' + data_source + '/evidence/'+ dbpedia + '/' + rule_mining + '/' + str(sentence_id)+'_.txt'
    os.remove(remove_file)
    return item_set
Example #12
class ShellCommandSerializer(object, Resetable):
    def __init__(self, name):
        """

        Parameters
        ----------
        name : str
        shell_command_args : OrderedSet<(str, str, str)>
        shell_commands : set<str>
        """
        self.name = name
        self.reset()

    def reset(self):
        self.shell_command_args = OrderedSet()
        self.__uniq_shell_commands = set()

    def get_shell_commands(self):
        return [x[1] for x in self.shell_command_args]

    def add_command(self, id, cmd, prefixes):
        prefixes = tuple(prefixes)
        if cmd not in self.__uniq_shell_commands:
            shell_command_arg = (id, cmd, prefixes)
            self.__uniq_shell_commands.add(cmd)
            self.shell_command_args.add(shell_command_arg)

    def run_commands(self, max_workers=None):
        if max_workers is None:
            max_workers = cpu_count()

        with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(lambda x: singletons.self.shell_helper.run_shell(*x),
                         self.shell_command_args)
Example #13
    def wrapper(*args):
        arg = args[0]
        nonlocal deferring # stateful!

        if arg == SAFEWORD:
            deferring = not deferring
            if deferring:
                # nothing else to do this turn
                return

            # we're not deferring and we have stored calls to process
            if not call_queue.empty():
                # input order cannot be guaranteed as we're using multiprocessing
                # single-process input order can be guaranteed
                calls = OrderedSet()
                while not call_queue.empty():
                    calls.add(call_queue.get())
                LOG.info("%s notification calls to be sent", len(calls))
                return [fn(*fnargs) for fnargs in calls]

            else:
                # we're not deferring and we have no calls to process
                # TODO: empty list or None ?
                return

        # store the args if we're deferring and return
        if deferring:
            call_queue.put(args)
            return

        # we're not deferring, call wrapped fn as normal
        return fn(*args)
Example #14
class Node:
    def __init__(self, x: int, y: int, width: int):
        self.x = x
        self.y = y
        self.width = width

        self.__neighbors = OrderedSet()
        self.__conn_ins = []
        self.__edge_cost = {}

    def add_edge(self, node: "Node", delay: int = 0,
                 force_connect: bool = False):
        if not force_connect:
            assert self.width == node.width
        if node not in self.__neighbors:
            self.__neighbors.add(node)
            node.__conn_ins.append(self)
            self.__edge_cost[node] = delay

    def remove_edge(self, node: "Node"):
        if node in self.__neighbors:
            self.__edge_cost.pop(node)
            self.__neighbors.remove(node)

            # remove the incoming connections as well
            node.__conn_ins.remove(self)

    def get_edge_cost(self, node: "Node") -> int:
        if node not in self.__edge_cost:
            return MAX_DEFAULT_DELAY
        else:
            return self.__edge_cost[node]

    def get_conn_in(self) -> List["Node"]:
        return self.__conn_ins

    def __iter__(self) -> Iterator["Node"]:
        return iter(self.__neighbors)

    def __len__(self):
        return len(self.__neighbors)

    @abstractmethod
    def __repr__(self):
        pass

    @abstractmethod
    def node_str(self):
        pass

    def clear(self):
        self.__neighbors.clear()
        self.__edge_cost.clear()
        self.__conn_ins.clear()

    def __contains__(self, item):
        return item in self.__neighbors

    def __hash__(self):
        return hash(self.width) ^ hash(self.x) ^ hash(self.y)
Example #15
    def get_ignore_types_in_groups(self, ignore_type_in_groups,
                                   ignore_string_type_changes,
                                   ignore_numeric_type_changes,
                                   ignore_type_subclasses):
        if ignore_type_in_groups:
            if isinstance(ignore_type_in_groups[0], type):
                ignore_type_in_groups = [ignore_type_in_groups]
        else:
            ignore_type_in_groups = []

        result = []
        for item_group in ignore_type_in_groups:
            new_item_group = OrderedSet()
            for item in item_group:
                item = type(item) if item is None or not isinstance(
                    item, type) else item
                new_item_group.add(item)
            result.append(new_item_group)
        ignore_type_in_groups = result

        if ignore_string_type_changes and self.strings not in ignore_type_in_groups:
            ignore_type_in_groups.append(OrderedSet(self.strings))

        if ignore_numeric_type_changes and self.numbers not in ignore_type_in_groups:
            ignore_type_in_groups.append(OrderedSet(self.numbers))

        if ignore_type_subclasses:
            ignore_type_in_groups = list(map(tuple, ignore_type_in_groups))

        return ignore_type_in_groups
Example #16
def deleteTracks(tracks):
    deletedTracks = OrderedSet()
    if not ("--delete" in sys.argv or "-d" in sys.argv) or len(tracks) == 0:
        return deletedTracks
    print("Will delete " + str(len(tracks)) + " songs from Youtube Music")
    confirmAll = False
    try:
        if sys.stdout.isatty():
            confirmAll = confirm("Confirm all (Y) or one by one (N)?")
        else:
            confirmAll = True
        for track in tracks:
            print("Delete " + str(track.artist) + " - " + str(track.title) +
                  " [" + str(track.album) + "]",
                  end="")
            if confirmAll or confirm("?"):
                if confirmAll:
                    print()
                if track.entityId:
                    ytmusic.delete_upload_entity(track.entityId)
                    deletedTracks.add(track)
                else:
                    print(
                        "No entity id for this. You may want to rebuild cache (-rc)"
                    )
    except:
        pass
    return deletedTracks
Example #17
class FileScrubber():
    excluded_set = {'[Chorus]', '[Chorus:]'}
    min_string_token_count = 2

    def __init__(self, file_name):
        self.file_name = file_name
        self.lines_set = OrderedSet()

    def scrub_file(self):
        with open(self.file_name, 'r') as file_handler:
            line_count = 0
            for line in file_handler:
                line_count += 1
                if not line or line in self.excluded_set or len(line.split()) < self.min_string_token_count:
                    continue
                line = line.strip(' ')
                if not line.endswith(',\n') and line_count % 3 == 0:
                    line = line.replace('\n', '.\n')
                else:
                    line = line.replace('\n', ' ')

                self.lines_set.add(line)

        with open('../scrubbed_file.txt', 'w') as file_handler:
            for item in self.lines_set:
                file_handler.write('{}'.format(item))
Example #18
    def resume(self, run, input):
        """
        Resumes an existing run with new input
        :param run: the previous run state
        :param input: the new input
        :return: the updated run state
        """
        if run.state == RunState.State.COMPLETED:
            raise FlowRunException("Cannot resume a completed run state")

        last_step = run.steps[-1] if len(run.steps) > 0 else None

        # reset steps list so that it doesn't grow forever in a never-ending flow
        run.steps = []

        if last_step:
            current_node = last_step.node  # we're resuming an existing run
        else:
            current_node = run.flow.entry  # we're starting a new run
            if not current_node:
                raise FlowRunException("Flow has no entry point")

        # tracks nodes visited so we can detect loops
        nodes_visited = OrderedSet()

        while current_node:
            # if we're resuming a previously paused step, then use its arrived on value
            if last_step and len(nodes_visited) == 0:
                arrived_on = last_step.arrived_on
            else:
                arrived_on = datetime.datetime.now(tz=pytz.UTC)

            # create new step for this node
            step = Step(current_node, arrived_on)
            run.steps.append(step)

            # should we pause at this node?
            if isinstance(current_node, RuleSet):
                if current_node.is_pause() and (not input or input.consumed):
                    run.state = RunState.State.WAIT_MESSAGE
                    return run

            # check for a non-pausing loop
            if current_node in nodes_visited:
                raise FlowLoopException(nodes_visited)
            else:
                nodes_visited.add(current_node)

            next_node = current_node.visit(self, run, step, input)

            if next_node:
                # if we have a next node, then record leaving this one
                step.left_on = datetime.datetime.now(tz=pytz.UTC)
            else:
                # if not then we've completed this flow
                run.state = RunState.State.COMPLETED

            current_node = next_node

        return run
Example #19
    def _order_node_columns(cols: Set) -> OrderedSet:
        """
        Arrange node columns in a defined order.

        Parameters
        ----------
        cols: Set
            A set with elements in any order

        Returns
        -------
        OrderedSet
            A set with elements in a defined order

        """
        node_columns = cols.copy()
        core_columns = OrderedSet([
            "id", "category", "name", "description", "xref", "provided_by",
            "synonym"
        ])
        ordered_columns = OrderedSet()
        for c in core_columns:
            if c in node_columns:
                ordered_columns.add(c)
                node_columns.remove(c)
        internal_columns = set()
        remaining_columns = node_columns.copy()
        for c in node_columns:
            if c.startswith("_"):
                internal_columns.add(c)
                remaining_columns.remove(c)
        ordered_columns.update(sorted(remaining_columns))
        ordered_columns.update(sorted(internal_columns))
        return ordered_columns
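The ordering logic above leans on OrderedSet.update() appending any new items at the end, in iteration order. A minimal sketch of that behaviour with a few hypothetical column names, assuming the ordered_set package:

from ordered_set import OrderedSet

ordered_columns = OrderedSet(["id", "name"])           # core columns found, in the defined order
ordered_columns.update(sorted({"comment", "alias"}))   # remaining columns, alphabetical
ordered_columns.update(sorted({"_private"}))           # internal columns last
print(list(ordered_columns))   # ['id', 'name', 'alias', 'comment', '_private']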
Example #20
class IncludeRequest(Request):
    """ Adds the ability to include webassets bundles on the request.

    If the bundle does not exist, a KeyError will be raised during the
    rendering of the response, after the view has returned.

    Including a bundle multiple times will have the same result as
    including it once.

    The bundles are rendered in the order in which they were included. Bundles
    that are included first, are also rendered first.

    For example:

        @App.html(model=Model)
        def view(self, request):
            request.include('jquery')  # includes the jquery bundle

    """

    def __init__(self, *args, **kwargs):
        super(IncludeRequest, self).__init__(*args, **kwargs)
        self.included_assets = OrderedSet()

    def include(self, resource):
        self.included_assets.add(resource)
Example #21
 def children(self):
     children = OrderedSet()
     for ports in self.wires:
         for port in ports:
             if port.owner() == self:
                 continue
             children.add(port.owner())
     return children
Example #22
def get_phrases_from_ext_dict(path):
    phrases = OrderedSet()
    f = open(path)
    lines = f.readlines()
    f.close()
    for line in lines:
        phrases.add(line.strip())
    return phrases
Example #23
    def days(self):
        days = OrderedSet()

        for service in self.services.available.values():
            for day in service.days:
                days.add(day)

        return days
Example #24
def hole_4d():
    hyperplanes = OrderedSet()
    a1 = np.array([1, 0, 0, 0])
    a2 = np.array([-1, 0, 0, 0])

    a3 = np.array([0, 1, 0, 0])
    a4 = np.array([0, -1, 0, 0])

    a5 = np.array([0, 0, 1, 0])
    a6 = np.array([0, 0, -1, 0])

    a7 = np.array([0, 0, 0, 1])
    a8 = np.array([0, 0, 0, -1])

    p0 = Hyperplane(a5, -1)
    p1 = Hyperplane(a6, 0)
    p2 = Hyperplane(a7, -1)
    p3 = Hyperplane(a8, 0)

    p4 = Hyperplane(a1, 0)
    p5 = Hyperplane(a2, -1)
    p6 = Hyperplane(a3, -1)
    p7 = Hyperplane(a4, 0)

    P1 = set()
    for h in [p0, p1, p2, p3, p4, p5, p6, p7]:
        index = hyperplanes.add(h)
        P1.add((index, -1))

    q1 = Hyperplane(a1, -1)
    q2 = Hyperplane(a2, 0)
    q3 = Hyperplane(a3, 0)
    q4 = Hyperplane(a4, -1)

    P2 = set()
    for h in [p0, p1, p2, p3, q1, q2, q3, q4]:
        index = hyperplanes.add(h)
        P2.add((index, -1))

    r1 = Hyperplane(a1, -2)
    r2 = Hyperplane(a2, 1)
    r3 = Hyperplane(a3, -1)
    r4 = Hyperplane(a4, 0)
    P3 = set()
    for h in [p0, p1, p2, p3, r1, r2, r3, r4]:
        index = hyperplanes.add(h)
        P3.add((index, -1))

    s1 = Hyperplane(a1, -1)
    s2 = Hyperplane(a2, 0)
    s3 = Hyperplane(a3, -2)
    s4 = Hyperplane(a4, 1)
    P4 = set()
    for h in [p0, p1, p2, p3, s1, s2, s3, s4]:
        index = hyperplanes.add(h)
        P4.add((index, -1))

    return Cell_Decomposition(hyperplanes, [P1, P2, P3, P4])
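hole_4d() relies on OrderedSet.add() returning the item's index, and returning the existing index when the same hyperplane is added again, so P1..P4 can share hyperplane indices (this also assumes Hyperplane defines value-based __eq__ and __hash__, which is not shown here). A minimal sketch of that add() behaviour with plain strings, assuming the ordered_set package:

from ordered_set import OrderedSet

hyperplanes = OrderedSet()
print(hyperplanes.add('x <= 1'))   # 0 -- new item, its index is returned
print(hyperplanes.add('y <= 1'))   # 1
print(hyperplanes.add('x <= 1'))   # 0 -- already present, the existing index is returned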
Example #25
class Index:
    def __init__(self,
                 simple_url=constants.PYPI_SIMPLE_URL,
                 package_url=constants.PYPI_PACKAGE_URL):
        self.package_url = package_url
        self.simple_url = simple_url
        self._package_names = None

    @property
    def package_names(self):
        if self._package_names is None:
            self._package_names = OrderedSet()
            self.reload()

        return self._package_names

    def _get_html_data(self):
        if self.simple_url.startswith('/'):
            with open(self.simple_url) as fp:
                data = fp.read()
        else:
            response = requests.get(self.simple_url)
            data = response.content

        return data
 
    def _get_names(self):
        data = self._get_html_data()
        soup = BeautifulSoup(data, 'html.parser')
        links = soup.find_all('a')
        names = (link.string for link in links)
        return names

    def _add_package_names(self, names):
        if self._package_names is None:
            self._package_names = OrderedSet()
 
        for name in names:
            self._package_names.add(name)

    def reload(self):
        """
        Reload package names from index.
        """
        names = self._get_names()
        self._add_package_names(names)

    def __len__(self):
        if self._package_names is None:
            return 0
        return len(self.package_names)

    def __iter__(self):
        return (Package(name, self) for name in self.package_names)

    def __repr__(self):
        return "<Index '{}'>".format(self.simple_url)
Example #26
class ClassificationResult(ABC):
    """
    The base class for classification problem result.
    """
    def __init__(self):
        self.resultdict = dict()
        self.metric_set = OrderedSet()
        self.label_set = OrderedSet()
        self.confusion_matrices = dict()

    def update_result(self, metric: str, label: str, value: float):
        """
        update the result based on metric name and class label (for each class)
        Args:
            metric (str): metric name, e.g. `accuracy`, `recall`
            label (str): class label name
            value (float): metric value

        Returns:

        """
        if metric not in self.resultdict.keys():
            self.resultdict[metric] = dict()
            self.metric_set.add(metric)
        self.resultdict[metric][label] = value
        self.label_set.add(label)

    @abstractmethod
    def load_results_from_meta(self,
                               evaluation_result: dict,
                               labels: List[str] = None):
        raise NotImplementedError('The derived class should implement it.')

    @abstractmethod
    def convert_metrics_to_table(
            self) -> List[Tuple[str, List[str], List[List[float]]]]:
        """
        converts the metrics saved in the object to a table that is ready to render in the report.
        Returns: a set of tables (title, header, values)
        """
        raise NotImplementedError('The derived class should implement it.')

    def get_metric_list(self):
        """
        returns all the metric names
        Returns: a list of metric names

        """
        return list(self.metric_set)

    def get_label_list(self):
        """
        returns all the class names
        Returns: a list of class label names

        """
        return list(self.label_set)
Example #27
 def ask_for_nodes(self):
     nodes_list = requests.get("http://{}/node".format(RAS_IP))
     nodes_list = json.loads(nodes_list.content)
     self.boot_node_list = nodes_list
     m_set = OrderedSet()
     for node in nodes_list:
         m_set.add(node.get('publicKey'))
     m_set = sorted(m_set)
     self.mining_nodes_list = m_set
Example #28
class Shape(object):
    def __init__(self):
        self._points = OrderedSet()
        self._outline = OrderedSet()
        self._border = OrderedSet()

        self.dirty = True
        self.midpoint = (0, 0)

    def refresh(self):
        self.find_points()
        self.find_outline()
        self.find_border()

        self.dirty = False

    @property
    def outline(self):
        """The points outside the shape that are adjacent to it"""
        if self.dirty:
            self.refresh()

        return self._outline

    @property
    def border(self):
        """the points inside the shape along the border"""
        if self.dirty:
            self.refresh()

        return self._border

    @property
    def points(self):
        if self.dirty:
            self.refresh()
            self.dirty = False

        return self._points

    def find_points(self):
        raise NotImplementedError()

    def find_outline(self):
        self._outline = OrderedSet()
        for point in self._points:
            for neighbor in neighbors(point):
                if neighbor not in self._points:
                    self._outline.add(neighbor)


    def find_border(self):
        self._border = OrderedSet()
        for point in self._points:
            for neighbor in neighbors(point):
                if neighbor not in self._points:
                    self._border.add(point)
Example #29
class Routes:
    def __init__(self):
        self.routes = {}
        self.route_number = 0
        self.route_list = OrderedSet()
        self.route = ''

    @property
    def _get_route_list(self):
        return self.route_list

    @property
    def _get_routes(self):
        return self.routes

    def _add(self, point):
        self.route_list.add(point)
        self.route = '-'.join(self.route_list)

    def _remove(self, point):
        self.route_list.remove(point)

    def _set_route(self):
        def _route_cost(_set, cost=0):
            _set = list(_set)
            for i in range(len(_set) - 1):
                if any(_set[i + 1] == k for k in nodes[_set[i]].keys()):
                    cost += nodes[_set[i]][_set[i + 1]]
            return cost

        if len(self.route) > 1:
            result = self.route
            if result not in self.routes.values():
                self.route_number += 1
                cost = _route_cost(self.route_list)
                self.routes[self.route_number] = result, cost
                self.route_list = self.route_list[:-1]
                self.route = ''

    @staticmethod
    def cheapest_route(routes):
        if routes:
            # keep the cheapest (route, cost) pair seen so far
            cheapest = None
            for route, cost in routes.values():
                if cheapest is None or cost < cheapest[1]:
                    cheapest = (route, cost)
            return f'cheapest route is: {cheapest}'
        return 'There are no routes...'

    def __str__(self):
        return str(self.routes)
Example #30
def extract_new_chars_from_phrases(phrases):
    old_chars = get_chars('../output/TRADITIONAL_CHARS.TXT')
    new_chars = OrderedSet()
    for phrase in phrases:
        for char in phrase:
            if char not in old_chars:
                new_chars.add(char)
    # write new chars
    for c in new_chars:
        print(c)
Example #31
def get_phrases_from_hvdict(path):
    phrases = OrderedSet()
    f = open(path)
    lines = f.readlines()
    f.close()
    for line in lines:
        d = json.loads(line.strip())
        for phrase in d['phrases']:
            phrases.add(phrase)
    return phrases
Example #32
def get_chars(path):
    # check
    chars = OrderedSet()
    f = open(path)
    lines = f.readlines()
    f.close()
    for line in lines:
        for char in line.strip():
            chars.add(char)
    return chars
Example #33
def get_data(path):
    res = OrderedSet()
    f = open(path)
    for line in f:
        if ' ' in line:
            res.add(line.split()[1].strip().replace(',', '').replace('.', ''))
        else:
            res.add(line.strip().replace(',', '').replace('.', ''))
    f.close()
    return res
Example #34
 def check_time_units(self):
     ret = OrderedSet()
     for name in self.wb.sheetnames:
         if self.is_rnum_sheet(name):
             ret.add(self.get_time_unit(self.wb[name]))
     if len(ret) > 1:
         print("ERROR: Multiple Time unit are preset in " + self.file +
               " " + str(ret))
         return False
     print("Time Unit : " + ''.join(ret))
     return True
Example #35
class Crawler():
    def __init__(self, url, depth=25):
        self.crawled_urls = OrderedSet([])
        if (is_url_valid(url)):
            url = get_clean_url(url, '')
            self.depth = depth
            self.index = 0
            self.crawled_urls.add(url)
            self.crawl(url)

    def crawl(self, url):
        '''
        Crawl over URLs
            - scrape for anchor tags with hrefs in a webpage
            - reject if unwanted or cleanup the obtained links
            - append to a set to remove duplicates
            - "crawled_urls" is the repository for crawled URLs
        @input:
            url: URL to be scraped
        '''
        found_urls = []
        try:
            page = urlopen(url)
            content = page.read()
            soup = BeautifulSoup(content, 'lxml', parse_only=SoupStrainer('a'))
            for anchor in soup.find_all('a'):
                link = anchor.get('href')
                if is_url_valid(link):
                    # Complete relative URLs
                    link = get_clean_url(url, link)
                    if is_link_internal(link, url):
                        found_urls.append(link)
                else:
                    pass

        except HTTPError as e:
            print('HTTPError:' + str(e.code) + ' in ', url)
        except URLError as e:
            print('URLError: ' + str(e.reason) + ' in ', url)
        except Exception:
            import traceback
            print('Generic exception: ' + traceback.format_exc() + ' in ', url)

        cleaned_found_urls = set(found_urls)  # To remove repetitions
        self.crawled_urls |= cleaned_found_urls  # Union of sets
        if (len(self.crawled_urls) > self.depth):
            self.crawled_urls = self.crawled_urls[:self.depth]
            return
        else:
            self.index += 1
            if self.index < len(self.crawled_urls):
                self.crawl(self.crawled_urls[self.index])
            else:
                return
Example #36
def solve(ring, values):
    for value in values:
        soln = OrderedSet([value])
        target = value + ring[0] + ring[1]
        for ridx in range(1, len(ring)):
            diff = target - (ring[ridx] + ring[ridx + 1])
            # TODO: We could short-circuit here if diff is not in
            # values, but I like this flow better
            if diff in values:
                soln.add(diff)
        if len(soln) == len(values):
            return ring, soln
    return None
Example #37
def sub_questions(question, flows):
    questions = OrderedSet()
    try:
        qflows = flows.filter(question=question).exclude(next_question=question)
        if qflows:
            for flow in qflows:
                if flow.next_question:
                    questions.add(flow.next_question)
                    subsequent = sub_questions(flow.next_question, flows)
                    map(lambda q: questions.add(q), subsequent)
    except QuestionFlow.DoesNotExist:
        return OrderedSet()
    return questions
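One caveat with the helper above: under Python 3, map() is lazy, so the map(lambda q: questions.add(q), subsequent) line only takes effect under Python 2, where map is eager. A small sketch of the pitfall and an eager alternative, assuming the ordered_set package:

from ordered_set import OrderedSet

questions = OrderedSet()
map(questions.add, ['q1', 'q2'])   # Python 3: map is lazy, nothing is added
print(len(questions))              # 0
questions.update(['q1', 'q2'])     # eager alternative that works on both versions
print(len(questions))              # 2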
Example #38
    def add (self, pkgList):
        """Given a list of lines from the input file, strip off any leading
           symbols and add the result to the appropriate list.
        """
        existingExcludedSet = OrderedSet(self.excludedList)
        existingPackageSet = OrderedSet(self.packageList)
        newExcludedSet = OrderedSet()
        newPackageSet = OrderedSet()

        excludedGroupList = []

        for pkg in pkgList:
            stripped = pkg.strip()

            if stripped[0:2] == "@^":
                self.environment = stripped[2:]
            elif stripped[0] == "@":
                self._processGroup(stripped[1:])
            elif stripped[0] == "-":
                if stripped[1:3] == "@^" and self.environment == stripped[3:]:
                    self.environment = None
                elif stripped[1] == "@":
                    excludedGroupList.append(Group(name=stripped[2:]))
                else:
                    newExcludedSet.add(stripped[1:])
            else:
                newPackageSet.add(stripped)

        # Groups have to be excluded in two different ways (note: can't use
        # sets here because we have to store objects):
        excludedGroupNames = [g.name for g in excludedGroupList]

        # First, an excluded group may be cancelling out a previously given
        # one.  This is often the case when using %include.  So there we should
        # just remove the group from the list.
        self.groupList = [g for g in self.groupList if g.name not in excludedGroupNames]

        # Second, the package list could have included globs which are not
        # processed by pykickstart.  In that case we need to preserve a list of
        # excluded groups so whatever tool doing package/group installation can
        # take appropriate action.
        self.excludedGroupList.extend(excludedGroupList)

        existingPackageSet = (existingPackageSet - newExcludedSet) | newPackageSet
        existingExcludedSet = (existingExcludedSet - existingPackageSet) | newExcludedSet

        # FIXME: figure these types out
        self.packageList = list(existingPackageSet)
        self.excludedList = list(existingExcludedSet)
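The set algebra at the end works because OrderedSet supports the usual set operators while keeping the left operand's order (union appends the right operand's new items). A minimal sketch, assuming the ordered_set package:

from ordered_set import OrderedSet

packages = OrderedSet(['vim', 'git', 'httpd'])
excluded = OrderedSet(['httpd', 'sendmail'])
print(list(packages - excluded))                            # ['vim', 'git'] -- left order kept
print(list((packages - excluded) | OrderedSet(['tmux'])))   # ['vim', 'git', 'tmux']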
Example #39
def standardize_vecs(labels, vecs, merge_mode='weighted'):
    standardized_labels = OrderedSet()
    standardized_vecs = []

    for index, (label, vec) in enumerate(zip(labels, vecs)):
        label = standardize(label)

        if merge_mode == 'weighted':
            vec /= (index + 1)

        if label not in standardized_labels:
            standardized_labels.add(label)
            standardized_vecs.append(vec)
        else:
            if merge_mode != 'first':
                index = standardized_labels.index(label)
                standardized_vecs[index] += vec

    return list(standardized_labels), np.array(standardized_vecs)
Example #40
    def write_wide_format_otu_table(**kwargs):
        output_table_io = kwargs.pop('output_table_io')
        table_collection = kwargs.pop('table_collection')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if hasattr(output_table_io, 'name'):
            logging.info("Writing %s" % output_table_io.name)
        else:
            logging.info("Writing an OTU table")

        # Collect a hash of sequence to sample to num_seqs
        gene_to_seq_to_sample_to_count = OrderedDict()
        sequence_to_taxonomy = {}
        samples = OrderedSet()
        for otu in table_collection:
            if otu.marker not in gene_to_seq_to_sample_to_count:
                gene_to_seq_to_sample_to_count[otu.marker] = {}
            if otu.sequence not in gene_to_seq_to_sample_to_count[otu.marker]:
                gene_to_seq_to_sample_to_count[otu.marker][otu.sequence] = {}
            if otu.sample_name in gene_to_seq_to_sample_to_count[otu.marker][otu.sequence]:
                raise Exception("Unexpectedly found 2 of the same sequences for the same sample and marker")
            gene_to_seq_to_sample_to_count[otu.marker][otu.sequence][otu.sample_name] = otu.count
            samples.add(otu.sample_name)
            # This isn't perfect, because the same sequence might have
            # different taxonomies in different samples. But taxonomy might
            # be of regular form, or as a diamond example etc, so eh.
            sequence_to_taxonomy[otu.sequence] = otu.taxonomy

        output_table_io.write("\t".join(itertools.chain( # header
            ['marker','sequence'],
            samples,
            ['taxonomy\n'])))
        for gene, seq_to_sample_to_count in gene_to_seq_to_sample_to_count.items():
            for seq, sample_to_count in seq_to_sample_to_count.items():
                row = [gene, seq]
                for sample in samples:
                    try:
                        row.append(str(sample_to_count[sample]))
                    except KeyError:
                        row.append('0')
                row.append(sequence_to_taxonomy[seq])
                output_table_io.write("\t".join(row)+"\n")
Example #41
 def all_questions(self):
     """This is might be different from the flow questions because it might have group paramater questions if present
     :return:
     """
     if self.parameter_list:
         questions = OrderedSet(self.parameter_list.parameters)
     else:
         questions = OrderedSet()
     map(lambda q: questions.add(q), self.flow_questions)
     return questions
Example #42
class SparseEntryStorage(object):
    """
    Temporarily stores entries of a labeled sparse matrix in an efficient
    format.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """
        Resets this SparseEntryStorage to being empty.
        """
        self.labels = OrderedSet()
        self.entries = defaultdict(float)

    def add_entry(self, entry):
        """
        Add a single triple of the form (value, row_label, col_label).
        """
        value, row_label, col_label = entry
        key = (self.labels.add(row_label), self.labels.add(col_label))
        self.entries[key] += value

    def add_entries(self, entries):
        """
        Add triples of the form (value, row_label, col_label).
        """
        for value, row_label, col_label in entries:
            key = (self.labels.add(row_label), self.labels.add(col_label))
            self.entries[key] += value

    def labels_and_matrix(self):
        """
        Return the labels and symmetrized sparse matrix.
        """
        # Borrowed from scipy.sparse.dok_matrix.tocoo()
        data = np.asarray(self.entries.values(), dtype='d')
        indices = np.asarray(self.entries.keys(), dtype=np.intc).T
        labels = self.labels

        matrix = coo_matrix((data, indices), shape=(len(labels), len(labels)))
        return labels, matrix + matrix.T
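A minimal usage sketch for the storage above, with hypothetical data; it assumes the class and its imports as defined, and only exercises add_entries() (labels_and_matrix() as written passes dict views straight to NumPy, which is Python 2-era usage). The point is that OrderedSet.add() turns each label into a stable matrix coordinate:

storage = SparseEntryStorage()
storage.add_entries([
    (1.0, 'cat', 'animal'),
    (2.0, 'dog', 'animal'),
    (0.5, 'cat', 'animal'),   # same label pair accumulates at the same (row, col) key
])
print(list(storage.labels))    # ['cat', 'animal', 'dog']
print(dict(storage.entries))   # {(0, 1): 1.5, (2, 1): 2.0}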
Example #43
 def select_averaged_rows(self, row_dict):
     """
     Given a mapping from labels to row-indices, returns a space in which
     the row with a given label is the average of those row-indices.
     """
     labels = OrderedSet()
     new_u = np.zeros((len(row_dict), self.k))
     for label, indices in row_dict.items():
         rownum = labels.add(label)
         old_rows = self.u[indices, :]
         new_u[rownum] = sum(old_rows) / len(old_rows)
     return self.__class__(new_u, self.sigma, labels)
Example #44
 def survey_questions(self):
     inline_ques = self.questions_inline()
     questions = OrderedSet(inline_ques)
     survey_questions = OrderedSet()
     other_flows = QuestionFlow.objects.exclude(validation_test__isnull=True,
                                                question__pk__in=[q.pk for q in inline_ques]).exclude(
                                                 next_question__pk__in=[q.pk for q in inline_ques] #skip questions
                                                 )
     for ques in inline_ques:
         survey_questions.append(ques)
         map(lambda q: survey_questions.add(q), sub_questions(ques, other_flows))
     return survey_questions
Example #45
    def ask(self, query_symbols, logical_query, coeff_expr=None):
        """
        Builds a pyDataLog program from the logical_query and loads it. Then executes the query for the query_symbols.

        :param query_symbols: The symbols to be queried.
        :type query_symbols: list(SubSymbol)
        :param logical_query:
        :type:
        :return:
        """
        helper_len = 0
        tmp = None
        if not query_symbols:
            return None
        if coeff_expr is None:
            helper_len = len(query_symbols)
            helper_predicate = 'helper(' + ','.join([str(v) for v in query_symbols]) + ')'
            tmp = helper_predicate + " <= " + self.transform_query(logical_query)
        else:
            helper_len = len(query_symbols) + 1
            syms = OrderedSet(query_symbols)
            syms.add('COEFF_EXPR')
            helper_predicate = 'helper(' + ','.join([str(v) for v in syms]) + ')'
            index_query = self.transform_query(logical_query)
            coeff_query = "(COEFF_EXPR == " + str(coeff_expr) + ")"
            if index_query is None:
                tmp = helper_predicate + " <= " + coeff_query
            else:
                tmp = helper_predicate + " <= " + " & ".join([index_query, coeff_query])
        log.debug("pyDatalog query: " + tmp)
        pyDatalog.load(tmp)
        answer = pyDatalog.ask(helper_predicate)
        pyEngine.Pred.reset_clauses(pyEngine.Pred("helper", helper_len))

        if answer is None:
            return []

        return self.transform_answer(answer.answers)
Example #46
 def _flow_questions():
     # next line is to normalize to question set. Otherwise it seems to be causing some issues with flows
     # since the flow is more native to Qset. Additional attributes in subclasses are just extras
     qset = QuestionSet.get(id=self.id)
     inline_ques = qset.questions_inline()
     OrderedSet(inline_ques)
     flow_questions = OrderedSet()
     for ques in inline_ques:
         flow_questions.append(ques)
          # boldly assuming sub-questions don't go
         map(lambda q: flow_questions.add(
             q), ques.direct_sub_questions())
          # more than one sub-question deep in the present implementation
     return flow_questions
Example #47
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return mat.tocsr(shape), index
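The orig_index behaviour described in the docstring comes from constructing the OrderedSet from the existing labels first: labels already present keep their old indices, and new labels are appended after them. A minimal sketch, assuming the ordered_set package:

from ordered_set import OrderedSet

labels = OrderedSet(['/c/en/cat', '/c/en/dog'])   # pre-populated from an existing index
print(labels.add('/c/en/cat'))     # 0 -- an existing label keeps its index
print(labels.add('/c/en/fish'))    # 2 -- a new label gets a higher index
print(labels.index('/c/en/dog'))   # 1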
Example #48
class Ellipse(Shape):
    rx = 0
    ry = 0

    def __init__(self, midpoint, rx, ry):
        super(Ellipse, self).__init__()

        self.midpoint = midpoint
        self.rx = int(rx)
        self.ry = int(ry)

    def from_rect(self, rect):
        ry = int(rect.height/2)
        rx = int(rect.width/2)
        midpoint = rect.midpoint

        return Ellipse(midpoint, rx, ry)

    def find_points(self):
        midx, midy = self.midpoint
        self._points = OrderedSet()

        for x in range(-1*self.rx, self.rx+1):
            for y in range(-1*self.ry, self.ry+1):
                if self.contains_point((int(x+midx), int(y+midy))):
                    self._points.add((int(x+midx), int(y+midy)))

    def contains_point(self, p):
        x, y = p
        midx, midy = self.midpoint

        vx = ((float(x) - float(midx))**2 / float(self.rx)**2)
        vy = ((float(y) - float(midy))**2 / float(self.ry)**2)

        v = vx + vy

        return v <= 1.0
Example #49
    def ask(self, query_symbols, logical_query, coeff_expr=None):
        """
        Builds a prolog query for a given set of query symbols, a logical query and a coefficient expression

        :param query_symbols: A Set of query (sub)symbols to be queried for
        :param logical_query: The logical query containing constants and presumably the query symbols
        :param coeff_expr: The coefficient expression for the given query
        :return: A list of tuples containing the answers for the query symbols
        """
        if coeff_expr is None:
            lhs_rule = 'helper(' + ','.join([str(v) for v in query_symbols]) + ')'
            rule = lhs_rule + ":-" + self.transform_query(logical_query) + "."
            query = "query(" + lhs_rule + ")."
        else:
            syms = OrderedSet(query_symbols)
            syms.add('COEFF_EXPR')
            lhs_rule = 'helper(' + ','.join([str(v) for v in syms]) + ')'
            index_query = self.transform_query(logical_query)
            coeff_query = "COEFF_EXPR = " + str(coeff_expr) + ""
            query = "query(" + lhs_rule + ")."
            if index_query is None:
                rule = lhs_rule + " :- " + coeff_query + "."
            else:
                rule = lhs_rule + " :- " + " , ".join([index_query, coeff_query]) + "."

        answer = self.execute([rule, query])

        answer_args = []
        for key in answer.keys():
            answer_args.append(key.args)

        # Query yields no result
        if answer.values()[0] == 0.0:
            return []

        return self.transform_answer(answer_args)
Example #50
class MasterTransducer(Transducer):
    """
    A collection of transducers. This class is intended to be used as a singleton.
    """

    def __init__(self):
        super().__init__([])
        self.transducers = OrderedDict()
        self.groups = OrderedDict()
        self.selected = OrderedSet()
        self.parser = None

    def add(self, transducer, groups=None):
        """
        Registers a `Transducer`.
        The transducers are guaranteed to execute in the order in which they are added.
        """
        assert isinstance(transducer, Transducer)
        name = transducer.name
        assert name is not None
        assert name not in self.transducers.keys(), 'Duplicate transducer "{0}"'.format(name)
        self.transducers[name] = transducer
        for group in groups or []:
            assert group in self.groups.values()
            group.add(transducer)

    def add_group(self, name, description=None):
        """
        Creates and registers a transducer group.

        :param name: the name of the group
        :param description: the description of the group
        :return: the constructed group instance
        """
        assert name not in self.groups.keys(), 'Duplicate group "{0}"'.format(name)
        group = TransducerGroup(name, description)
        self.groups[name] = group
        return group

    def add_arguments(self, parser):
        """
        Registers command line arguments that control this `MasterTransducer` in an :py:mod:`ArgumentParser`.

        :param parser: an :py:mod:`ArgumentParser`
        """
        assert isinstance(parser, ArgumentParser)
        self.parser = parser
        group_names = list(self.groups.keys())
        parser.add_argument('--group', '-g', nargs='+', action='append', choices=group_names, metavar='G',
                            help=_(
                                'Enables the transducer group G. Combine with --help to show detailed information. Available groups: {0}').format(
                                ', '.join(group_names)))
        transducer_names = list(self.transducers.keys())
        parser.add_argument('--transducer', '-t', nargs='+', action='append', choices=transducer_names, metavar='T',
                            help=_(
                                'Enables the transducer T. Combine with --help to show detailed information. Available transducers: {0}').format(
                                ', '.join(transducer_names)))

    def configure(self, args, file=sys.stdout):
        """
        Configures this `MasterTransducer` using the arguments parsed by an :py:mod:`ArgumentParser`.

        :param args: command line arguments parsed by an :py:mod:`ArgumentParser`
        :param file: the file to print help string to
        """
        self.selected = OrderedSet()
        if args.group:
            for group_name in chain.from_iterable(args.group):
                group = self.groups[group_name]
                if args.help:
                    self.parser.print_help(file)
                    file.write('\n')
                    group.print_help(file)
                    self.parser.exit()
                for transducer in group.transducers:
                    self.selected.add(transducer)
        if args.transducer:
            for transducer_name in chain.from_iterable(args.transducer):
                transducer = self.transducers[transducer_name]
                if args.help:
                    self.parser.print_help(file)
                    file.write('\n')
                    transducer.print_help(file)
                    self.parser.exit()
                self.selected.add(transducer)
        if len(self.selected) == 0:
            # If no transducer is selected explicitly, all transducers are used.
            self.selected = self.transducers.values()

    @overrides
    def substitute(self, string, indices):
        """
        Translates a string using the selected transducers.
        """
        for transducer in self.selected:
            string, indices = transducer.substitute(string, indices)
        return string, indices
Example #51
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    assert set1.index(tup) == 0
    assert set1[0] == tup
Example #52
class TableShaper(object):

    #-------------------------
    # unfold
    #----------------- 


    def unfold(self,
               in_path_or_2d_array, 
               col_name_to_unfold, 
               col_name_unfold_values, 
               out_method=OutMethod.STDOUT, 
               constant_cols=None, 
               new_col_names_col=None):
        '''
        Unfold (reshape) data frame like the following example:
        
           ======   ========   ============   =========   ======
           userId   question   questionType   timeAdded   answer 
           ======   ========   ============   =========   ======
            10      DOB          pullDown       Jun2010    1983     
            10      gender       radio          May2011      F
            20      DOB          pullDown       Jun2010    1980
            20      gender       radio          May2011      M
                             ...
                             
        Let the unfold column be 'question', and the 'constants'
        columns be 'questionType' and 'timeAdded'. You could call the
        function like this:
        
           unfold('/tmp/in.csv', 
                  col_name_to_unfold='question'
                  col_name_unfold_values='answer'
                  constant_cols=['questionType','timeAdded'])
                  
        The reshaped table looks like this:
        
           ========   ============  =========    ==     ==
           question   questionType  timeAdded    v1     v2   
           ========   ============  =========    ==     ==       
             DOB        pullDown    June2010    1983   1980
           gender        radio       May2011     F       M      
        
    
        Each line is now one question. All answers to one question
        are columns in that question's row. It is an error to have
        inconsistencies in the constant columns. For instance,
        if the original row "20  DOB   pullDown..." had been
        "20  DOB  radio", an error would have been raised. All constant-column
        field values for the same question (in different rows of the original)
        must match.
        
        Another way to call the function controls the names of the new
        columns. One column  can be specified to provide the column headers:
        
           unfold('/tmp/in.csv',
                  col_name_to_unfold='question',
                  col_name_unfold_values='answer',
                  constant_cols=['questionType','timeAdded'],
                  new_col_names_col='userId')
                  
        The reshaped table would look like this:
        
           ========   ============  =========    ==     ==
           question   questionType  timeAdded    10     20
           ========   ============  =========    ==     ==       
             DOB        pullDown    Jun2010     1983   1980
           gender        radio       May2011     F       M      
                  
         
         I.e. the user id values are used as the column headers
         of the new table.
         
         To have the function behave like an iterator
         (each item will be an array with one row of the
          reshaped table):
         
           it = unfold('/tmp/in.csv',
                       col_name_to_unfold='question',
                       col_name_unfold_values='answer',
                       constant_cols=['questionType','timeAdded'],
                       out_method=OutMethod.ITERATOR)
           for row in it:
               print(row)
               
        To write the output to a file:
        
           unfold('/tmp/in.csv',
                  col_name_to_unfold='question',
                  col_name_unfold_values='answer',
                  constant_cols=['questionType','timeAdded'],
                  new_col_names_col='userId',
                  out_method=OutMethod('/tmp/trash.csv'))
        
         
        :param in_path_or_2d_array: location of input CSV file, or
            an array of arrays. First row must be column names.
        :type in_path_or_2d_array: {string | [[]]}
        :param col_name_to_unfold: name of the column to unfold into columns
        :type col_name_to_unfold: string
        :param col_name_unfold_values: column name of the unfold values, i.e. the values 
             in rows under the new columns 
        :type col_name_unfold_values: string 
        :param out_method: where to put the output CSV. If omitted,
             new table is written to stdout.
        :type out_method: OutMethod
        :param constant_cols: names of columns that are to be retained 
        :type constant_cols: {None | [string]}
        :param new_col_names_col: name of column to use for column names of new columns
        :type new_col_names_col: {None | string}
        '''
        
        # Error checking and initializations:
        
        if type(col_name_to_unfold) != str:
            raise ValueError('Must name column that is to be unfolded')
        else:
            self.col_name_to_unfold = col_name_to_unfold
        
        if new_col_names_col is not None and type(new_col_names_col) != str:
            raise ValueError('New-column prefix must be a string, was %s' % new_col_names_col)
        self.new_col_names_col = new_col_names_col
        if new_col_names_col is None:
            # No col specified to provide column headers
            # for new columns:
            # String for creating names for the new columns.
            # The string is prefixed to 1,2,3,...: 'v' for 'value':
            self.new_col_prefix = 'v'
        
        if constant_cols is not None:
            if type(constant_cols) != list:
                raise ValueError('Parameter constant_cols must be None or a list of column names.')
            self.constant_cols = constant_cols
        else:
            # constant_cols is None:
            self.constant_cols = []
        
        self.out_method = out_method
        self.col_name_unfold_values = col_name_unfold_values
        
        # Place to accumulate the unfolded values:
        self.unfolded_values_dict = OrderedDict()
        
        # Place to hold the columns that are constant:
        self.const_col_dict = OrderedDict()
        
        # Place to hold names for new columns:
        self.new_col_names = OrderedSet()
        
        try:
            if type(in_path_or_2d_array) == str:
                # Get in-table from a file:
                in_fd = open(in_path_or_2d_array, 'r')
                reader = csv.reader(in_fd, delimiter=',') 
            else:
                # Get in-table from a 2d array:
                reader = iter(in_path_or_2d_array)
                in_fd = None
    
            # Look at in-table's header line and get various
            # constants initialized:
                    
            self.header = self.process_in_header_line(reader) 
            
            # Read the rows and create in-memory representation
            # of transformed structure:
            for row in reader:
                
                # Field value of the unfold-column that is key of rows in new tbl
                # e.g. 'DOB' or 'gender':
                unfold_col_value = row[self.col_indx_to_unfold]
                
                # Encountered this key (i.e. unfold-col value) before?
                # If not, init with empty array of that key's value for
                # the subject who is represented by this row.
                # We'll end up with this: {'DOB' : ['1983', '1980'], 'gender' : ['M','F']}:
                collected_values = self.unfolded_values_dict.get(unfold_col_value, [])
                
                # Value of this unfold-key in this row (e.g. '1983' or 'M'):
                unfold_value = row[self.col_indx_of_values]
                collected_values.append(unfold_value)
                self.unfolded_values_dict[unfold_col_value] = collected_values
                
                # Now take care of constant columns.
                # For each unique value of the column that
                # is being unfolded, constant columns must
                # be unique. Example to end up with:
                #
                #    question   questionType   answer1    answer2
                #    --------------------------------------------
                #      DOB       pullDown       1980       1983
                #     gender      radio          F          M
                #
                # Cannot have original table contain 'pullDown' for 
                # some DOB row, and 'radio' for another. This won't
                # work as an original:
                #     subject   question answer  questionType
                #    -----------------------------------------
                #    subject1    DOB      1980   pullDown
                #    subject1   gender     F      radio
                #    subject2    DOB      1983    radio
                #    subject2   gender     M      radio
                # 
    
                for col_num in range(len(row)):
                    try:
                        col_name = self.header[col_num]
                    except IndexError:
                        raise ValueError('Row %s has more columns than header (%s)' % (col_num, self.header))
                    col_value = row[col_num]
                    
                    # Is this column constant for a given pivot column value?
                    if col_name in self.constant_cols:
                        
                        # Dict: 
                        #    {(<unfold-col-value, constant_col_name) : constant_col_value}
                        # I.e. for each of the values in the column to be unfolded,
                        # each constant column has the same value, else something is wrong.
                        # Check whether we already encountered the value in the current
                        # row's unfold-value; if not init, if yes, ensure that this 
                        # constant-col's value in the current row is the same as in 
                        # other rows in which the unfold-value is the same as in this row:
                        
                        const_values_dict_key = (unfold_col_value,col_name)
                        col_constant = self.const_col_dict.get(const_values_dict_key, None)
                        
                        if col_constant is None:
                            self.const_col_dict[const_values_dict_key] = col_value
                        else:
                            # Saw value for this column and pivot value earlier:
                            if col_value != col_constant:
                                raise ValueError("Column that is supposedly constant for a given pivot value is not: %s != %s" %\
                                                 (col_value, col_constant))
                                
                    # Are we to use an existing column as source for
                    # names of new columns?
                    
                    if self.new_col_names_col is not None:
                        self.new_col_names.add(row[self.new_cols_col_indx])
                     
        finally:
            if type(in_path_or_2d_array) == str:
                in_fd.close()
                                    
        return(self.output_result())

    # ---------------------------------- Private Methods ---------------------

    
    #-------------------------
    # create_out_header_row
    #----------------- 
    
    def create_out_header_row(self, header):
        
        # Create CSV: col_name_to_unfold, constant_cols[0], constant_cols[1], ..., unfolded-values-columns
        # Find the longest row of unfolded values, so that we can pad
        # them with zeroes:
        unfolded_max_len = 0
        for unfolded_value in self.unfolded_values_dict.keys():
            num_unfolded_values = len(self.unfolded_values_dict[unfolded_value])
            unfolded_max_len = max(num_unfolded_values, unfolded_max_len)

        # Header: start with the column name that was unfolded:
        header = [self.col_name_to_unfold]
        # Continue with any columns that were constant for
        # any given unfold-value:
        header.extend(self.constant_cols)
        # Finally: invent names for all the unfolded values
        # that are now columns; or the caller specified a
        # self.new_col_names_col, and we accumulated values
        # from that column-name-providing column in self.new_col_names
        if self.new_col_names_col is not None:
            for new_col_header in self.new_col_names:
                header.append(new_col_header)
        else:
            # Invent names for the new columns: v<n>:
            for indx in range(unfolded_max_len):
                header.append('%s%s' % (self.new_col_prefix, indx))
        
        return (header, unfolded_max_len)

    
    #-------------------------
    # process_in_header_line
    #----------------- 
    
    def process_in_header_line(self, reader):
        '''
        Given a csv or excel reader that is pointed at a
        table file, read the first row, which is expected
        to be the table header. Error-check, and return
        that header. 
        
        :param reader: object providing the file-like API
        :type reader: csv.Reader
        '''
    
        header = next(reader)
        
        # If we are to use the value of a column to name
        # new columns created for the unfolded values,
        # ensure the col exists:
        
        if self.new_col_names_col is not None:
            try:
                self.new_cols_col_indx = header.index(self.new_col_names_col)
            except ValueError:   # list.index() raises ValueError when the column is missing
                raise ValueError('Specified column %s as source of col names for unfolded columns, but no such column exists' % self.new_col_names_col)
        else:
            self.new_cols_col_indx = None
        try:
            # Does the column to be unfolded exist?
            # in the running example: 'question':
            self.col_indx_to_unfold = header.index(self.col_name_to_unfold)
        except ValueError:
            raise ValueError('The column to unfold (%s) does not appear in the table header (%s)' % (self.col_name_to_unfold, header))
        try:
            # Does the column with the unfold-values
            # exist? In the running example: 'answer':
            self.col_indx_of_values = header.index(self.col_name_unfold_values)
        except ValueError:
            raise ValueError('The column of unfold values (%s) does not appear in the table header (%s)' % (self.col_name_unfold_values, header))
        return header
        
    #-------------------------
    # output_result
    #----------------- 
        
    def output_result(self):
        # Do the writing-out, to STDOUT, a file, or
        # by building an internal 2d array of the result
        # and returning an iterator to it:
        try:
            # Will be None if iterator requested:
            (out_fd, writer) = self.make_writer(self.out_method)
            
            (header, unfolded_max_len) = self.create_out_header_row(self.header)
    
            if self.out_method == OutMethod.ITERATOR:
                result = [header]
            else:
                writer.writerow(header)
            # Each new row is about one of the unfolded values,
            # like 'DOB' or 'gender' in the example:
            for unfold_key in self.unfolded_values_dict.keys():
                new_row = [unfold_key]
                # Add constant-column values if any:
                for col_name in self.constant_cols:
                    # The constant-column value for the current
                    # rows value in the column being unfolded is
                    # kept in self.const_col_dict. Keys are tuples:
                    # (unfold_col_value, constant_col_name):
                    const_col_key = (unfold_key, col_name)
                    col_constant = self.const_col_dict[const_col_key]
                    new_row.append(col_constant)
                
                unfolded_values = self.unfolded_values_dict[unfold_key]
                # Fill short-row vectors with zeros:
                unfolded_values = unfolded_values + (unfolded_max_len - len(unfolded_values))*[0]
                new_row.extend(unfolded_values)
                if self.out_method == OutMethod.ITERATOR:
                    result.append(new_row)
                else:
                    writer.writerow(new_row)
        finally:
            if self.out_method == OutMethod.ITERATOR:
                return(iter(result))
            elif self.out_method != OutMethod.STDOUT:
                out_fd.close()

    # ---------------------------------- Support Methods ---------------------
                    
    #-------------------------
    # make_writer
    #----------------- 
            
    def make_writer(self, out_method):
        # Obtain a csv writer object if function is
        # not called as a generator:
        if out_method != OutMethod.ITERATOR and out_method != OutMethod.STDOUT:
            fd = open(out_method.FILE, 'w')
        elif out_method == OutMethod.STDOUT:
            fd = sys.stdout
        else:
            fd = writer = None
        if fd is not None:
            writer = csv.writer(fd)
        return (fd,writer)
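
A hedged usage sketch of the unfold() call above, using an in-memory 2-d array and the iterator output mode; it mirrors the running example from the docstring and assumes TableShaper and OutMethod are importable from the module this example was taken from.

shaper = TableShaper()
rows = [['userId', 'question', 'questionType', 'timeAdded', 'answer'],
        ['10', 'DOB',    'pullDown', 'Jun2010', '1983'],
        ['10', 'gender', 'radio',    'May2011', 'F'],
        ['20', 'DOB',    'pullDown', 'Jun2010', '1980'],
        ['20', 'gender', 'radio',    'May2011', 'M']]
it = shaper.unfold(rows,
                   col_name_to_unfold='question',
                   col_name_unfold_values='answer',
                   constant_cols=['questionType', 'timeAdded'],
                   new_col_names_col='userId',
                   out_method=OutMethod.ITERATOR)
for row in it:
    print(row)
# Expected rows:
#   ['question', 'questionType', 'timeAdded', '10', '20']
#   ['DOB', 'pullDown', 'Jun2010', '1983', '1980']
#   ['gender', 'radio', 'May2011', 'F', 'M']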
Example #53
0
def assertions_to_sql_csv(msgpack_filename, output_dir):
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_edge_sources = output_dir + '/edge_sources.csv'
    output_node_prefixes = output_dir + '/node_prefixes.csv'
    output_features = output_dir + '/edge_features.csv'

    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()
    seen_prefixes = set()

    edge_file = open(output_edges, 'w', encoding='utf-8')
    edge_source_file = open(output_edge_sources, 'w', encoding='utf-8')
    node_prefix_file = open(output_node_prefixes, 'w', encoding='utf-8')
    feature_file = open(output_features, 'w', encoding='utf-8')

    for assertion in read_msgpack_stream(msgpack_filename):
        if assertion['uri'] in assertion_list:
            continue
        assertion_idx = assertion_list.add(assertion['uri'])
        rel_idx = relation_list.add(assertion['rel'])
        start_idx = node_list.add(assertion['start'])
        end_idx = node_list.add(assertion['end'])

        source_indices = []
        sources = assertion['sources']
        for source in sources:
            for sourceval in sorted(source.values()):
                source_idx = source_list.add(sourceval)
                source_indices.append(source_idx)

        jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
        weight = assertion['weight']
        write_row(
            edge_file,
            [assertion_idx, assertion['uri'],
             rel_idx, start_idx, end_idx,
             weight, jsondata]
        )
        for node in (assertion['start'], assertion['end'], assertion['dataset']):
            write_prefixes(node_prefix_file, seen_prefixes, node_list, node)
        for source_idx in sorted(set(source_indices)):
            write_row(edge_source_file, [assertion_idx, source_idx])

        if assertion['rel'] in SYMMETRIC_RELATIONS:
            features = [(0, start_idx), (0, end_idx)]
        else:
            features = [(1, start_idx), (-1, end_idx)]

        for direction, node_idx in features:
            write_row(feature_file, [rel_idx, direction, node_idx, assertion_idx])

    edge_file.close()
    edge_source_file.close()
    node_prefix_file.close()
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
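
The indices written above lean on a property of the ordered_set package that several of these examples rely on: OrderedSet.add() returns the position of the item, whether it was just inserted or was already present, so each URI gets a stable integer id. A minimal sketch of just that behaviour:

from ordered_set import OrderedSet

node_list = OrderedSet()
assert node_list.add('/c/en/cat') == 0     # first item gets index 0
assert node_list.add('/c/en/dog') == 1
assert node_list.add('/c/en/cat') == 0     # re-adding returns the existing index
assert list(node_list) == ['/c/en/cat', '/c/en/dog']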
Example #54
0
class ColumnName(object):
    """A ColumnName is a string naming the Column and optionally a set of qualifiers.
    
    In SQL, ColumnName qualifiers are usually table names or table aliases."""

    def __init__(self, name, qualifiers=None, allow_wildcard=False):
        self.original_token = name
        name_parts = name.split('.')
        self.name = name_parts[-1]
        self.is_wildcard = False

        if allow_wildcard and is_wildcard_identifier(self.name):
            self.is_wildcard = True
        elif not is_valid_identifier(self.name):
            raise InvalidColumnNameError(self.name)

        self.qualifiers = OrderedSet(qualifiers or [])
        if len(name_parts) > 1:
            self.qualifiers.add('.'.join(name_parts[:-1]).lower())

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._cased_name = value
        self._name = value.lower()

    @property
    def qualifiers(self):
        return self._qualifiers

    @qualifiers.setter
    def qualifiers(self, value):
        self._qualifiers = OrderedSet([qual.lower() for qual in value])

    def __eq__(self, other):
        if type(other) is type(self):
            return (self.name == other.name and other.qualifiers == self.qualifiers)
        return False
     
    def __gt__(self, other):
        if type(other) is type(self):
            return ((self.name == other.name or self.is_wildcard) and self.qualifiers <= other.qualifiers)
        return False
     
    def __lt__(self, other):
        return other > self

    def __ge__(self, other):
        return (self > other or self == other)

    def __le__(self, other):
        return (self < other or self == other)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.name)

    def __str__(self):
        return self._cased_name

    def __repr__(self):
        return '<ColumnName ' + '.'.join([qualifiers_to_str(self.qualifiers), self.name]) + '>'

    def match(self, *right_column_names):
        """Given a list of ColumnNames, return a list of those that match this ColumName.

        This operation is not commutative. That is, A.match(B) =/=> B.match(A)."""
        return [col for col in right_column_names if self >= col]
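
A short sketch of the comparison semantics encoded above, assuming ColumnName and the identifier helpers it depends on are importable from its module: an unqualified name matches a qualified one (its empty qualifier set is a subset of the other's), but not the reverse, which is the asymmetry the match() docstring warns about.

unqualified = ColumnName('name')
qualified = ColumnName('employees.name')

assert unqualified >= qualified            # {} is a subset of {'employees'}
assert not (qualified >= unqualified)
assert unqualified.match(qualified) == [qualified]
assert qualified.match(unqualified) == []  # match() is not commutative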
Example #55
0
	def findSubGraphs(self):
		"""Find equivalent path from two equivalent nodes
		For each node hash it gets all of the BB and try to build path from each pair of them
		The result is put in a dual dictionary that has the starting node hash as the first key, the path hash as the second key and the equivalent pathS as a list of sets(containing nodes) 
		"""
		matchedPathsWithDifferentLengths = 0
		for i in self.M.keys():
			for z in range(0,len(self.M[i])-1):
				for j in self.M[i][z+1:]:							#pick one from the second node onward
					visited1=set()
					visited2=set()
					q1=Queue.Queue()					# queue of matched node pairs, seeded with the two starting nodes
					q1.put((self.M[i][z],j))
					path1=OrderedSet()
					path2=OrderedSet()
					path1_bis=OrderedSet()
					path2_bis=OrderedSet()
					path1NodeHashes = {}
					path1.add(self.M[i][z])
					path2.add(j)
					path1Str=''
					path2Str=''
					path1NodeHashes[self.M[i][z]]=self.G[(self.M[i][z])].ctx.hash_itype2
					pathHash1= hashlib.sha1()
					while not q1.empty():			                            # for each matching pair in the queue
						x,y = q1.get(block = False)
						tmp_visited2=set()
						for l in self.G[x].succs :						
							matchedbyHash = False
							if (l not in visited1) and (l !=x) and (l not in path1):
								visited1.add(l)
								tmp_visited2Backup=tmp_visited2   
								hashType = 'hash_itype1'
								matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs( l, y, hashType, visited2, tmp_visited2, path2)

								if not matchedbyHash:
									hashType = 'hash_itype2'
									tmp_visited2= tmp_visited2Backup
									matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs( l, y, hashType, visited2, tmp_visited2, path2)
								
								if not matchedbyHash:
									hashType = 'freq'
									tmp_visited2= tmp_visited2Backup
									matchedbyHash, m, tmp_visited2 = self.findMatchInSuccs( l, y, hashType, visited2, tmp_visited2, path2)

								if matchedbyHash:
									path1NodeHashes[l] = self.G[l][hashType]
									path1.add(l)
									path2.add(m)
									q1.put((l,m))
									visited2.add(m)

						visited2.update(tmp_visited2)
					if (len(path1) != len(path2)):
						matchedPathsWithDifferentLengths += 1
					else:
						path1_bis, path2_bis = self.makeSubgraphSingleEntryPoint(path1, path2) 
				
					if len(path1) >1:
						for kk in path1:
							path1Str+=path1NodeHashes[kk]
							
						pathHash1.update(path1Str)
						a=pathHash1.hexdigest()
						if not(self.pathPerNodeHashFull.has_key(i)) or (not( self.pathPerNodeHashFull[i].has_key(a))):
							self.pathPerNodeHashFull[i][a]=[]

						duplicate1 = False
						duplicate2 = False
					
						listPath1 = list(path1)
						listPath2 = list(path2)
						
						for zz in self.pathPerNodeHashFull[i][a]:
							if listPath1 == zz:
								duplicate1 = True
							if listPath2 == zz:
								duplicate2 = True
								
						if not duplicate1:
							self.pathPerNodeHashFull[i][a].append(list(listPath1))
						if not duplicate2:
							self.pathPerNodeHashFull[i][a].append(list(listPath2))

					if len(path1_bis) >1:
						path1Str = ''
						for kk in path1_bis:
							path1Str+=path1NodeHashes[kk]
							
						pathHash1.update(path1Str)
						a=pathHash1.hexdigest()
						if not(self.pathPerNodeHash.has_key(i)) or (not( self.pathPerNodeHash[i].has_key(a))):
							self.pathPerNodeHash[i][a]=[]

						duplicate1 = False
						duplicate2 = False
					
						listPath1 = list(path1_bis)
						listPath2 = list(path2_bis)
						
						for zz in self.pathPerNodeHash[i][a]:
							if listPath1 == zz:
								duplicate1 = True
							if listPath2 == zz:
								duplicate2 = True
								
						if not duplicate1:
							self.pathPerNodeHash[i][a].append(list(listPath1))
						if not duplicate2:
							self.pathPerNodeHash[i][a].append(list(listPath2))					
Example #56
0
class Rectangle(Shape):
    width = 0
    height = 0

    def __init__(self, midpoint, width, height):
        super().__init__()
        self.midpoint = midpoint
        midx, midy = midpoint
        self.width = width
        self.height = height
        ul_x = -1 * int(self.width / 2) + midx
        ul_y = -1 * int(self.height / 2) + midy
        self.ul = (ul_x, ul_y)

    def find_points(self):
        startx, starty = self.ul
        self._points = OrderedSet()

        for x in range(int(startx), int(startx)+self.width):
            for y in range(int(starty), int(starty)+self.height):
                self._points.add((int(x), int(y)))

    def area(self):
        return self.width * self.height

    def grow(self, direction):
        if direction == Direction.north:
            old_x, old_y = self.ul
            self.ul = (old_x, old_y - 1)
            new_x, new_y = self.ul
            self.height += 1
            new_midy = new_y + int(self.height/2)
            self.midpoint = (self.midpoint[0], new_midy)

        elif direction == Direction.west:
            old_x, old_y = self.ul
            self.ul = (old_x - 1, old_y)
            new_x, new_y = self.ul
            self.width += 1
            new_midx = new_x + int(self.width/2)
            self.midpoint = (new_midx, self.midpoint[1])

        elif direction == Direction.south:
            self.height += 1
            x, y = self.ul
            new_midy = y + int(self.height/2)
            self.midpoint = (self.midpoint[0], new_midy)

        elif direction == Direction.east:
            self.width += 1
            x, y = self.ul
            new_midx = x + int(self.width/2)
            self.midpoint = (new_midx, self.midpoint[1])

        self.dirty = True

    def move(self, direction):
        dx, dy = direction
        ul_x, ul_y = self.ul
        mx, my = self.midpoint
        self.ul = ul_x+dx, ul_y+dy
        self.midpoint = mx+dx, my+dy

        self.dirty=True

    def edge(self, direction):
        """points on the border along the <direction> edge"""
        points = set()
        if direction == Direction.north:
            ul_x, y = self.ul
            for x in range(ul_x, ul_x + self.width):
                points.add((x, y - 1))

        elif direction == Direction.south:
            ll_x, ul_y = self.ul
            y = ul_y + self.height - 1
            for x in range(ll_x, ll_x + self.width):
                points.add((x, y + 1))

        elif direction == Direction.east:
            ul_x, ul_y = self.ul
            x = ul_x + self.width - 1
            for y in range(ul_y, ul_y + self.height):
                points.add((x + 1, y))

        elif direction == Direction.west:
            x, ul_y = self.ul
            for y in range(ul_y, ul_y + self.height):
                points.add((x-1, y))

        return points
Example #57
0
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # Initialized here so the non-symmetric branch below also starts
            # from an empty list instead of reusing a stale value:
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
Example #58
0
class Featurizer(object):
	#
	# Converts chorales into a matrix of feature indices. Each vector in a matrix represents a specific beat within
	# a chorale. Note that indices are 1-based to comply with Torch. 
	#

	# Initialize with the number of scores to analyze
	def __init__(self):
		self.percentage_train = 0.8 # percentage of scores to be in the training split
		self.percentage_dev = 0.5 	# percentage of the remaining scores to be used as the dev set
		self.data_dir = "raw_data/"
		self.output_dir = "data/"

		# Features
		self.keys = OrderedSet()
		self.modes = OrderedSet()
		self.times = OrderedSet()
		self.beats = OrderedSet()
		self.offsets = OrderedSet()
		self.cadence_dists = OrderedSet()
		self.cadences = OrderedSet()
		self.pitches = OrderedSet()
		self.intervals = OrderedSet()
		self.roots = OrderedSet()
		self.inversions = OrderedSet()
		self.bases = OrderedSet()
		self.altos = OrderedSet()
		self.tenors = OrderedSet()

		# THIS ORDER MATTERS
		self.input_features = [self.keys, self.modes, self.times, self.beats, self.offsets, self.cadence_dists, \
								self.cadences, self.pitches, self.intervals, self.intervals, self.roots, \
								self.bases, self.inversions]
		self.output_features = [self.roots, self.bases, self.inversions, self.altos, self.tenors]

	# Collect all preprocessed scores
	@timing
	def gather_scores(self):
		from os import listdir
		self.original = []
		for f in glob(self.data_dir + "*.xml"):
			score = converter.parse(f)
			if score.parts[0].quarterLength > 300: # skip one excessively long score
				continue
			self.original.append(score)
		print "Gathered %d 4-part chorales." % len(self.original)
		
		return self.original

	# Create X and y matrices of features for each chorale
	@timing
	def analyze(self):
		print "Analyzing..."
		self.analyzed = [] # to save time, we store the objects related to each score for featurizing
		Xvalues, yvalues = [], []

		# Create X and y matrices for each chorale
		for idx, score in enumerate(self.original):
			sys.stdout.write("Analyzing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			# score-wide features
			S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), getNotes(score.parts[2]), getNotes(score.parts[3])
			assert len(S) == len(A)
			assert len(A) == len(T)
			assert len(T) == len(B)
			time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
			key_obj = getKeyFromSignature(key_sig)
			tonic = key_obj.tonic.midi
			fermata_locations = map(hasFermata, S)

			# Input/target data for each chorale
			Xc, yc = [], []

			# Create X vector and y output
			for index, n in enumerate(S):
				# [0]: Key
				v_key = key_sig.sharps
				self.keys.add(v_key)
				# [1]: Mode
				v_mode = key_sig.mode
				self.modes.add(v_mode)
				# [2]: Time
				v_time = (time_sig.numerator, time_sig.denominator)
				self.times.add(v_time)
				# [3]: Beat strength
				v_beat = n.beatStrength
				self.beats.add(n.beatStrength)
				# [4]: Offset end
				v_off_end = int(math.floor((len(S) - index) / 4.))
				self.offsets.add(v_off_end)
				# [5]: Cadence distance
				v_cadence_dist = 0 if hasFermata(n) else fermata_locations[index:].index(True)
				self.cadence_dists.add(v_cadence_dist)
				# [6]: Is a point of cadence
				v_cadence = 1 if hasFermata(n) else 0
				self.cadences.add(v_cadence)
				# [7]: Soprano pitch (relative to key signature)
				v_pitch = (n.midi - tonic) % 12
				self.pitches.add(v_pitch)
				# [8]: Interval to previous melody note
				v_ibefore = S[index].midi - S[index - 1].midi if index > 0 else 'None'
				self.intervals.add(v_ibefore)
				# [9]: Interval to next melody note
				v_iafter = S[index + 1].midi - S[index].midi if index < len(S) - 1 else 'None'
				self.intervals.add(v_iafter)
				# [10]: root at time t-1
				# [11]: base at time t-1
				# [12]: inversion at time t-1
				timetminus1 = yc[-1] if len(yc) > 0 else ('*padding*', '*padding*', '*padding*')
				v_root_prev = timetminus1[0] # NOTE THE ORDER
				v_base_prev = timetminus1[1]
				v_inv_prev = timetminus1[2]
				
				# Output vector
				# [0]: root
				# [1]: base
				# [2]: inversion
				consonance = [1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0] # see gct module
				v_root, v_inv, v_base = gct.GCT(tonic, consonance, chord.Chord([B[index], T[index], A[index], S[index]]))
				self.roots.add(v_root)
				self.bases.add(v_base)
				self.inversions.add(v_inv)
				# [3]: Alto pitch (relative to key signature)
				v_alto = (A[index].midi - tonic) % 12
				self.altos.add(v_alto)
				# [4]: Tenor pitch (relative to key signature)
				v_tenor = (T[index].midi - tonic) % 12
				self.tenors.add(v_tenor)
 
				# Input vector
				input_vec = [v_key, v_mode, v_time, v_beat, v_off_end, v_cadence_dist, v_cadence, \
							 v_pitch, v_ibefore, v_iafter, v_root_prev, v_base_prev, v_inv_prev]
				output_vec = [v_root, v_base, v_inv, v_alto, v_tenor]

				Xc.append(input_vec)
				yc.append(output_vec)

			self.analyzed.append((Xc, yc, score, idx))
			Xvalues.append(Xc)
			yvalues.append(yc)

		# Add the 'n/a' option
		for feature_space in self.input_features + self.output_features:
			feature_space.add('*padding*')

		freezeObject(Xvalues, 'Xvalues')
		freezeObject(yvalues, 'yvalues')
		freezeObject(self.roots, 'roots')
		freezeObject(self.bases, 'bases')
		freezeObject(self.inversions, 'inversions')
		freezeObject(self.altos, "alto_range")
		freezeObject(self.tenors, "tenor_range")
		freezeObject(self.input_features, "input_features")
		freezeObject(self.output_features, "output_features")
	
	# After calling self.analyze, this converts the X and y matrices to vectors of feature indices
	# As scores are examined, the indices of output chords are generated.
	@timing
	def featurize(self):
		print "Featurizing..."
		self.featurized = []

		# Set the indices
		self.input_indices = []
		max_index = 1
		for feature_space in self.input_features:
			self.input_indices.append((max_index, max_index + len(feature_space) - 1))
			max_index += len(feature_space)

		for Xc, yc, score, idx in self.analyzed:
			Xcf, ycf = [], []
			for vec in Xc:
				fvec = []
				for fidx, feature_space in enumerate(self.input_features):
					f_feature = feature_space.index(vec[fidx])
					fvec.append(f_feature + self.input_indices[fidx][0])
				Xcf.append(fvec)
			for vec in yc:
				fvec = []
				for fidx, feature_space in enumerate(self.output_features):
					fvec.append(feature_space.index(vec[fidx]) + 1)
				ycf.append(fvec)
			self.featurized.append((npy.matrix(Xcf), npy.matrix(ycf), score, idx))


	# Verify that the feature indices are all in the right ranges
	@timing
	def verify(self):
		print "Verifying indices..."
		for Xcf, ycf, score, idx in self.featurized:
			for fidx in range(Xcf.shape[1]):
				assert Xcf[:, fidx].min() >= self.input_indices[fidx][0]
				assert Xcf[:, fidx].max() <= self.input_indices[fidx][1]
				if fidx > 0:
					assert Xcf[:, fidx - 1].max() < Xcf[:, fidx].min() 
			for fidx in range(ycf.shape[1]):
				assert ycf[:, fidx].min() >= 1
				assert ycf[:, fidx].max() <= len(self.output_features[fidx])

	# Split the chorales into training, dev, and test groups
	@timing
	def train_test_split(self):
		self.train, remaining = split(self.featurized, self.percentage_train)
		self.dev, self.test = split(remaining, self.percentage_dev)
		print "Training, dev, and tests sets contain %d, %d, %d chorales, respectively." \
				% (len(self.train), len(self.dev), len(self.test))

	# Create the aggregate datasets
	@timing
	def aggregrate(self):
		stack = lambda x1, x2: npy.vstack((x1, x2))
		self.trainX, self.trainy = [x for (x, y, sc, idx) in self.train], [y for (x, y, sc, idx) in self.train]
		self.devX, self.devy = [x for (x, y, sc, idx) in self.dev], [y for (x, y, sc, idx) in self.dev]
		self.testX, self.testy = [x for (x, y, sc, idx) in self.test], [y for (x, y, sc, idx) in self.test]
		self.trainXall, self.trainyall = reduce(stack, self.trainX), reduce(stack, self.trainy)
		self.devXall, self.devyall = reduce(stack, self.devX), reduce(stack, self.devy)
		self.testXall, self.testyall = reduce(stack, self.testX), reduce(stack, self.testy)
		self.dataXall = stack(stack(self.trainXall, self.devXall), self.testXall)
		self.datayall = stack(stack(self.trainyall, self.devyall), self.testyall)
	# Write 
	@timing
	def write(self):
		print "Writing to %s folder." % self.output_dir
		with h5py.File(self.output_dir + "chorales.hdf5", "w", libver="latest") as f:
			# Write accumulated chorales
			f.create_dataset("Xtrain", self.trainXall.shape, dtype="i", data=self.trainXall)
			f.create_dataset("ytrain", self.trainyall.shape, dtype="i", data=self.trainyall)
			f.create_dataset("Xdev", self.devXall.shape, dtype="i", data=self.devXall)
			f.create_dataset("ydev", self.devyall.shape, dtype="i", data=self.devyall)
			f.create_dataset("Xtest", self.testXall.shape, dtype="i", data=self.testXall)
			f.create_dataset("ytest", self.testyall.shape, dtype="i", data=self.testyall)
			# Write every chorale into train/dev/test sets
			with open('data/chorale_index.txt', 'w') as m:
				m.write("TRAINING SET\n")
				for idx, (X, y) in enumerate(zip(self.trainX, self.trainy)):
					f.create_dataset("train/chorale%d_X" % idx, X.shape, dtype="i", data=X)
					f.create_dataset("train/chorale%d_y" % idx, y.shape, dtype="i", data=y)
					m.write("%d\t %s\n" % (idx, self.train[idx][2].metadata.title))
				m.write("VALIDATION SET\n")
				for idx, (X, y) in enumerate(zip(self.devX, self.devy)):
					f.create_dataset("dev/chorale%d_X" % idx, X.shape, dtype="i", data=X)
					f.create_dataset("dev/chorale%d_y" % idx, y.shape, dtype="i", data=y)
					m.write("%d\t %s\n" % (idx, self.dev[idx][2].metadata.title))
				m.write("TEST SET\n")
				for idx, (X, y) in enumerate(zip(self.testX, self.testy)):
					f.create_dataset("test/chorale%d_X" % idx, X.shape, dtype="i", data=X)
					f.create_dataset("test/chorale%d_y" % idx, y.shape, dtype="i", data=y)
					m.write("%d\t %s\n" % (idx, self.test[idx][2].metadata.title))
			# Write every chorale individually
			for Xcf, ycf, score, idx in self.featurized:
				f.create_dataset("chorale%d_X" % idx, Xcf.shape, dtype="i", data=Xcf)
				f.create_dataset("chorale%d_y" % idx, ycf.shape, dtype="i", data=ycf)

		# Save test scores for future use
		test_scores = [sc for (x, y, sc, idx) in self.test]
		test_dir = '/Users/hzabriskie/Documents/Thesis/thesis/bach_code/data/test_scores'
		if not os.path.exists(test_dir):
			os.makedirs(test_dir)
		for idx, sc in enumerate(test_scores):
			sc.write('musicxml', test_dir + '/' + str(idx) + '.xml')


	def run(self):
		self.gather_scores()
		self.analyze()
		self.featurize()
		self.verify()
		self.train_test_split()
		self.aggregrate()
		self.write()
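
A small self-contained sketch of the 1-based index-offset scheme that featurize() builds into input_indices, using two hypothetical feature spaces in place of the ones analyze() accumulates:

from ordered_set import OrderedSet

keys = OrderedSet([-1, 0, 2])              # hypothetical key signatures seen so far
modes = OrderedSet(['major', 'minor'])     # hypothetical modes
input_features = [keys, modes]

input_indices = []
max_index = 1                              # 1-based to comply with Torch
for feature_space in input_features:
    input_indices.append((max_index, max_index + len(feature_space) - 1))
    max_index += len(feature_space)

assert input_indices == [(1, 3), (4, 5)]   # keys occupy 1..3, modes occupy 4..5
# A concrete value is encoded as its position in its OrderedSet plus the slot's start offset:
assert modes.index('minor') + input_indices[1][0] == 5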
Example #59
0
class Featurizer(object):
	#
	# Converts chorales into a matrix of feature indices. Each vector in a matrix represents a specific beat within
	# a chorale. Note that indices are 1-based to comply with Torch. 
	#

	# Initialize with the number of scores to analyze
	def __init__(self):
		self.percentage_train = 0.8 # percentage of scores to be in the training split
		self.percentage_dev = 0.5 	# percentage of the remaining scores to be used as the dev set
		self.data_dir = "raw_data/"
		self.output_dir = "data/"

		# Features
		self.keys = OrderedSet()
		self.modes = OrderedSet()
		self.times = OrderedSet()
		self.beats = OrderedSet()
		self.offsets = OrderedSet()
		self.cadence_dists = OrderedSet()
		self.cadences = OrderedSet()
		self.pitches = OrderedSet()
		self.intervals = OrderedSet()
		self.numerals = OrderedSet()
		self.inversions = OrderedSet()
		self.altos = OrderedSet()
		self.tenors = OrderedSet()

		# THIS ORDER MATTERS
		self.input_features = [self.keys, self.modes, self.times, self.beats, self.offsets, self.cadence_dists, \
							self.cadences, self.pitches, self.intervals, self.intervals, self.numerals, self.inversions]
		self.output_features = [self.numerals, self.inversions, self.altos, self.tenors]

	# Collect all preprocessed scores
	@timing
	def gather_scores(self):
		from os import listdir
		self.original = []
		for f in glob(self.data_dir + "*.xml"):
			self.original.append(converter.parse(f))
		print "Gathered %d 4-part chorales." % len(self.original)
		
		return self.original

	# Create X and y matrices of features for each chorale
	@timing
	def analyze(self):
		print "Analyzing..."
		self.analyzed = [] # to save time, we store the objects related to each score for featurizing
		Xvalues, yvalues = [], []

		# Create X and y matrices for each chorale
		for idx, score in enumerate(self.original):
			sys.stdout.write("Analyzing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			# score-wide features
			S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), getNotes(score.parts[2]), getNotes(score.parts[3])
			assert len(S) == len(A)
			assert len(A) == len(T)
			assert len(T) == len(B)
			time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
			key_obj = getKeyFromSignature(key_sig)
			fermata_locations = map(hasFermata, S)

			# Input/target data for each chorale
			Xc, yc = [], []

			# Create X vector and y output
			for index, n in enumerate(S):
				# [0]: Key
				v_key = key_sig.sharps
				self.keys.add(v_key)
				# [1]: Mode
				v_mode = key_sig.mode
				self.modes.add(v_mode)
				# [2]: Time
				v_time = (time_sig.numerator, time_sig.denominator)
				self.times.add(v_time)
				# [3]: Beat strength
				v_beat = n.beatStrength
				self.beats.add(n.beatStrength)
				# [4]: Offset end
				v_off_end = int(math.floor((len(S) - index) / 4.))
				self.offsets.add(v_off_end)
				# [5]: Cadence distance
				v_cadence_dist = 0 if hasFermata(n) else fermata_locations[index:].index(True)
				self.cadence_dists.add(v_cadence_dist)
				# [6]: Is a point of cadence
				v_cadence = 1 if hasFermata(n) else 0
				self.cadences.add(v_cadence)
				# [7]: Pitch
				v_pitch = n.midi
				self.pitches.add(v_pitch)
				# [8]: Interval to previous melody note
				v_ibefore = S[index].midi - S[index - 1].midi if index > 0 else 'None'
				self.intervals.add(v_ibefore)
				# [9]: Interval to next melody note
				v_iafter = S[index + 1].midi - S[index].midi if index < len(S) - 1 else 'None'
				self.intervals.add(v_iafter)
				# [10]: Numeral at time t-1
				# [11]: Inversion at time t-1
				timetminus1 = yc[-1] if len(yc) > 0 else ('None', 'None')
				v_num_prev = timetminus1[0] # Roman numeral
				v_inv_prev = timetminus1[1] # inversion
				# Intentionally not adding these to self.numerals and self.inversions
				
				# Output vector
				# [0]: numeral
				# [1]: inversion
				v_num, v_inv = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
				self.numerals.add(v_num)
				self.inversions.add(v_inv)
				# [2]: alto pitch
				v_alto = A[index].midi
				self.altos.add(v_alto)
				# [3]: tenor pitch
				v_tenor = T[index].midi
				self.tenors.add(v_tenor)

				# Input vector
				input_vec = [v_key, v_mode, v_time, v_beat, v_off_end, v_cadence_dist, v_cadence, \
							 v_pitch, v_ibefore, v_iafter, v_num_prev, v_inv_prev]
				output_vec = [v_num, v_inv, v_alto, v_tenor]

				Xc.append(input_vec)
				yc.append(output_vec)

			self.analyzed.append((Xc, yc, score, idx))
			Xvalues.append(Xc)
			yvalues.append(yc)

		# Add the 'n/a' option
		self.numerals.add('None')
		self.inversions.add('None')
		self.intervals.add('None')

		freezeObject(Xvalues, 'Xvalues')
		freezeObject(yvalues, 'yvalues')
	
	# After calling self.analyze, this converts the X and y matrices to vectors of feature indices
	# As scores are examined, the indices of output chords are generated.
	@timing
	def featurize(self):
		print "Featurizing..."
		self.featurized = []

		# Set the indices
		self.input_indices = []
		max_index = 1
		for feature_space in self.input_features:
			self.input_indices.append((max_index, max_index + len(feature_space) - 1))
			max_index += len(feature_space)

		for Xc, yc, score, idx in self.analyzed:
			Xcf, ycf = [], []
			for vec in Xc:
				fvec = []
				for fidx, feature_space in enumerate(self.input_features):
					f_feature = feature_space.index(vec[fidx])
					fvec.append(f_feature + self.input_indices[fidx][0])
				Xcf.append(fvec)
			for vec in yc:
				fvec = []
				for fidx, feature_space in enumerate(self.output_features):
					fvec.append(feature_space.index(vec[fidx]) + 1)
				ycf.append(fvec)
			self.featurized.append((npy.matrix(Xcf), npy.matrix(ycf), score, idx))


	# Verify that the feature indices are all in the right ranges
	@timing
	def verify(self):
		print "Verifying indices..."
		for Xcf, ycf, score, idx in self.featurized:
			for fidx in range(Xcf.shape[1]):
				assert Xcf[:, fidx].min() >= self.input_indices[fidx][0]
				assert Xcf[:, fidx].max() <= self.input_indices[fidx][1]
				if fidx > 0:
					assert Xcf[:, fidx - 1].max() < Xcf[:, fidx].min() 
			for fidx in range(ycf.shape[1]):
				assert ycf[:, fidx].min() >= 1
				assert ycf[:, fidx].max() <= len(self.output_features[fidx])

	# Split the chorales into training, dev, and test groups
	@timing
	def train_test_split(self):
		self.train, remaining = split(self.featurized, self.percentage_train)
		self.dev, self.test = split(remaining, self.percentage_dev)
		print "Training, dev, and tests sets contain %d, %d, %d chorales, respectively." \
				% (len(self.train), len(self.dev), len(self.test))

	# Create the aggregate datasets
	@timing
	def aggregrate(self):
		stack = lambda x1, x2: npy.vstack((x1, x2))
		trainX, trainy = [x for (x, y, sc, idx) in self.train], [y for (x, y, sc, idx) in self.train]
		devX, devy = [x for (x, y, sc, idx) in self.dev], [y for (x, y, sc, idx) in self.dev]
		testX, testy = [x for (x, y, sc, idx) in self.test], [y for (x, y, sc, idx) in self.test]
		self.trainXall, self.trainyall = reduce(stack, trainX), reduce(stack, trainy)
		self.devXall, self.devyall = reduce(stack, devX), reduce(stack, devy)
		self.testXall, self.testyall = reduce(stack, testX), reduce(stack, testy)
		self.dataXall = stack(stack(self.trainXall, self.devXall), self.testXall)
		self.datayall = stack(stack(self.trainyall, self.devyall), self.testyall)

	# Write 
	@timing
	def write(self):
		print "Writing to %s folder." % self.output_dir
		with h5py.File(self.output_dir + "chorales.hdf5", "w", libver="latest") as f:
			f.create_dataset("Xtrain", self.trainXall.shape, dtype="i", data=self.trainXall)
			f.create_dataset("ytrain", self.trainyall.shape, dtype="i", data=self.trainyall)
			f.create_dataset("Xdev", self.devXall.shape, dtype="i", data=self.devXall)
			f.create_dataset("ydev", self.devyall.shape, dtype="i", data=self.devyall)
			f.create_dataset("Xtest", self.testXall.shape, dtype="i", data=self.testXall)
			f.create_dataset("ytest", self.testyall.shape, dtype="i", data=self.testyall)
			with open('data/chorale_index.txt', 'w') as m:
				for Xcf, ycf, score, idx in self.featurized:
					f.create_dataset("chorale%d_X" % idx, Xcf.shape, dtype="i", data=Xcf)
					f.create_dataset("chorale%d_y" % idx, ycf.shape, dtype="i", data=ycf)
					m.write("%d\t %s\n" % (idx, score.metadata.title))

		freezeObject(self.input_features, "input_features")
		freezeObject(self.input_indices, "input_indices")
		freezeObject(self.numerals, "numerals")
		freezeObject(self.inversions, "inversions")
		freezeObject(self.altos, "alto_range")
		freezeObject(self.tenors, "tenor_range")


	def run(self):
		self.gather_scores()
		self.analyze()
		self.featurize()
		self.verify()
		self.train_test_split()
		self.aggregrate()
		self.write()
Example #60
0
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    eq_(set1.index(tup), 0)
    eq_(set1[0], tup)