def cleanup_csv(line):
    '''
    Introduced in Version 3.2.4, I wrote this function while building a decision
    tree for a very large national econometric database.  The fields in the CSV
    file for this database are allowed to be double-quoted, and such fields may
    contain commas inside them.  This function also replaces empty fields with
    the generic string 'NA' as shorthand for "Not Available".  IMPORTANT: This
    function skips over the first field in each record, which is assumed to be
    the ID number for the record.
    '''
    # Blank out brackets and single quotes.  (The 256-byte table from
    # bytes.maketrans also works with str.translate in Python 3, since indexing
    # bytes yields integer code points.)
    line = line.translate(bytes.maketrans(b"()[]{}'", b"       ")) \
           if sys.version_info[0] == 3 else line.translate(string.maketrans("()[]{}'", "       "))
    double_quoted = re.findall(r'"[^\"]+"', line[line.find(',') : ])
    for item in double_quoted:
        clean = re.sub(r',', r'', item[1:-1].strip())
        parts = re.split(r'\s+', clean.strip())
        line = str.replace(line, item, '_'.join(parts))
    white_spaced = re.findall(r',\s*[^,]+\s+[^,]+\s*,', line)
    for item in white_spaced:
        if re.match(r',\s+,', item) : continue
        replacement = '_'.join(re.split(r'\s+', item[:-1].strip())) + ','
        line = str.replace(line, item, replacement)
    fields = re.split(r',', line)
    newfields = []
    for field in fields:
        newfield = field.strip()
        if newfield == '':
            newfields.append('NA')
        else:
            newfields.append(newfield)
    line = ','.join(newfields)
    return line
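A quick usage sketch (added for illustration; the sample record is hypothetical, and re, string, and sys are assumed to be imported):

sample = 'id42,"New York, NY",real gdp,,7.5'
print(cleanup_csv(sample))  # -> id42,New_York_NY,real_gdp,NA,7.5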
Example #2
def summary_up_result(result_file, ignore, row_head, column_mark):
    """
    Use to summary the monitor or other kinds of results. Now it calculates
    the average value for each item in the results. It fits to the records
    that are in matrix form.

    @result_file: files which need to calculate
    @ignore: pattern for the comment in results which need to through away
    @row_head: pattern for the items in row
    @column_mark: pattern for the first line in matrix which used to generate
    the items in column
    Return: A dictionary with the average value of results
    """
    head_flag = False
    result_dict = {}
    column_list = {}
    row_list = []
    fd = open(result_file, "r")
    for eachLine in fd:
        if len(re.findall(ignore, eachLine)) == 0:
            if len(re.findall(column_mark, eachLine)) != 0 and not head_flag:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[i] = {}
                        column_list[column] = i
                        column += 1
                head_flag = True
            elif len(re.findall(column_mark, eachLine)) == 0:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                row_flag = False
                for i in row_list:
                    if row == i:
                        row_flag = True
                if row_flag is False:
                    row_list.append(row)
                    for i in result_dict:
                        result_dict[i][row] = []
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[column_list[column]][row].append(i)
                        column += 1
    fd.close()
    # Calculate the average value
    average_list = {}
    for i in column_list:
        average_list[column_list[i]] = {}
        for j in row_list:
            average_list[column_list[i]][j] = {}
            check = result_dict[column_list[i]][j][0]
            if utils_misc.aton(check) or utils_misc.aton(check) == 0.0:
                count = 0
                for k in result_dict[column_list[i]][j]:
                    count += utils_misc.aton(k)
                average_list[column_list[i]][j] = "%.2f" % (count /
                                                            len(result_dict[column_list[i]][j]))

    return average_list
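For illustration, a hypothetical call (utils_misc.aton is assumed to convert a
numeric string to a number).  Given a result file containing:

    Time  cpu  mem
    t1    10   200
    t1    20   400

summary_up_result(path, ignore='^#', row_head=r'(^\w+)', column_mark='Time')
would return {'cpu': {'t1': '15.00'}, 'mem': {'t1': '300.00'}}.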
Example #3
def hm_tree(p, nwords, split_lvl=0, ctype=''):
  splits = [r';', r',']
  split_lvl = min(split_lvl, len(splits)-1)

  # check for method claim
  if ctype == '':
    if re.search(r'method', p):
      ctype = 'method'
    else:
      ctype = 'device'
  
  # first split along parent -> [children] line
  if ctype == 'device':
    split_words = r'(compris\w+|has|having|including)'
  else:
    split_words = r'(compris\w+\sthe\ssteps\sof)'
  split_markers = r'.*?(?::|-|\s)(?:\sa\splurality\sof)?'
  parts_rgx = r'^(.*?)' + split_words + split_markers + r'(.*)$'
  parts = re.match(parts_rgx, p)
  if parts:

    # NOTE: could change which words from head chunk are selected here
    parent = get_head_words(parts.group(1), nwords, ctype)

    # then split the [children] array
    children = re.split(splits[split_lvl] + r'(?:\s*and)?', parts.group(3))
    if len(children) == 1:
      children = re.split(r'and', parts.group(3))
    return Tree(parent, merge_trees([hm_tree(child, nwords, split_lvl+1, ctype) for child in children]))
  else:
    
    # try splitting on splitters here
    # NOTE: to do later...? danger of pulling in lots of crap

    return Tree(get_head_words(p, nwords, ctype), [])
Example #4
def parse_dmapdumpstring(dumpstring):
    scandata = {}
    scan = dumpstring.split('scalars:')[-1].split('arrays:')
    scalars = scan[0].split('\n')
    vectors = re.split(VECTOR_SPLITTER, scan[1])
    for scalar in scalars:
        if scalar == '':
            continue
        assignment = scalar.split('\t')[-1].split(' = ')
        var = assignment[0].lstrip('"').rstrip('"')
        value = eval(assignment[1])
        scandata[var] = value
    for vector in vectors:
        vector = vector.split('=')

        if len(vector) <= 1:
            continue
        var = vector[0].split('"')[1]
        vecvalue = []
        for v in re.split(ELEM_SPLITTER, vector[1]):
            v = v.rstrip(',')
            if v == '':
                continue
            if v == 'inf' or v == 'nan' or v == '-nan':
                v = 'float("NaN")'
            try:
                vecvalue.append(eval(v))
            except:
                print 'error parsing vector'

        scandata[var] = np.array(vecvalue)
    return scandata
Example #5
def get_head_words(s, nwords, ctype):
  #print ctype
  #print s
  
  # first limit to before any commas, semicolons; and remove stop list phrases
  s = re.split(r'[;,]', s)[0]
  remove_list = r'(a\splurality\sof\s|at\sleast|composition\sof|the\ssteps\sof|wherein\s*(?:said)?|first|second|third|(?:[a-z]|\d+)?(?:\)|\.))'
  s = re.sub(remove_list, '', s)

  if ctype == 'device':
    
    # get first ~ <JJ>*<NN>+ chunk
    return first_JN_chunk(s, nwords)
  
  elif ctype == 'method':
    
    # first try to split around "method" (for first parent node)
    msplit1 = re.split(r'method\s(of|for|to)', s)
    if len(msplit1) > 1:
      return first_V_chunk(msplit1[2], nwords)
    msplit2 = re.split(r'method', s)
    if len(msplit2) > 1:
      return first_V_chunk(msplit2[0], nwords)

    # else, get first VBG + its subject if possible
    return first_V_chunk(s, nwords)
Example #6
def _extract_metadata(content):
    tree = etree.fromstring(content)
    ns = {'xhtml': 'http://www.w3.org/1999/xhtml'}
    subject = tree.xpath('//xhtml:title', namespaces=ns)[0].text

    metadata_nodes = tree.xpath('//xhtml:meta', namespaces=ns)
    metadata_nodes = [n for n in metadata_nodes if 'name' in n.attrib]
    metadata = {}
    for node in metadata_nodes:
        metadata[node.attrib['name']] = node.attrib['content']

    for n in metadata_nodes:
        n.getparent().remove(n)

    content = etree.tostring(tree, pretty_print=True, encoding=unicode)

    sender = metadata.get('mail-sender', u'')
    to_recipients_txt = metadata.get('mail-to-recipients', u'')
    cc_recipients_txt = metadata.get('mail-cc-recipients', u'')
    bcc_recipients_txt = metadata.get('mail-bcc-recipients', u'')
    to_recipients = filter(None, re.split(r'\s*,\s*', to_recipients_txt))
    cc_recipients = filter(None, re.split(r'\s*,\s*', cc_recipients_txt))
    bcc_recipients = filter(None, re.split(r'\s*,\s*', bcc_recipients_txt))

    return content, subject, sender, to_recipients, cc_recipients, bcc_recipients
Example #7
    def __init__(self, filename, myopen=open, swapYZ=False):
        super(MeshPLY,self).__init__()

        with myopen(filename, "r") as f:
             assert f.readline().strip() == "ply"
             assert f.readline().strip().startswith("format ascii")
             elementCounts = []
             while True:
                 line = f.readline().strip()
                 if line == "end_header":
                     break
                 args = re.split("\\s+",line)
                 if len(args) >= 3 and args[0] == 'element':
                     elementCounts.append((args[1],int(args[2])))
             assert len(elementCounts) >= 2
             for element,count in elementCounts:
                 for i in range(count):
                     line = f.readline().strip()
                     if element == 'vertex':
                         args = re.split("\\s+",line)
                         if swapYZ:
                             v = V3(float(args[0]),float(args[2]),-float(args[1]))
                         else:
                             v = V3(float(args[0]),float(args[1]),float(args[2]))
                         self.vertices.append(v)
                     elif element == 'face':
                         args = re.split("\\s+",line)
                         count = int(args.pop(0))
                         v = tuple(int(args[j]) for j in range(count))
                         self.faces.append((0,v))

        assert self.vertices
        assert self.faces
Example #8
def parse_range_string(input_lines):
    ip_range_list = []

    ip_lines_list = re.split("\r|\n", input_lines)
    for raw_line in ip_lines_list:
        raw_s = raw_line.split("#")
        context_line = raw_s[0]

        context_line = context_line.replace(' ', '')

        ips = re.split(",|\|", context_line)
        for line in ips:
            if len(line) == 0:
                #print "non line:", line
                continue
            begin, end = ip_utils.split_ip(line)
            if ip_utils.check_ip_valid(begin) == 0 or ip_utils.check_ip_valid(end) == 0:
                print("ip format is error,line:%s, begin: %s,end: %s" % (line, begin, end))
                continue
            nbegin = ip_utils.ip_string_to_num(begin)
            nend = ip_utils.ip_string_to_num(end)
            ip_range_list.append([nbegin,nend])
            #print begin, end

    ip_range_list.sort()

    return ip_range_list
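A hypothetical call (ip_utils is assumed to provide split_ip, check_ip_valid,
and ip_string_to_num as used above):

ranges = parse_range_string("10.0.0.0-10.0.0.255 # a comment\n8.8.8.8|9.9.9.9")
# -> sorted list of [begin, end] pairs of integer addresses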
Example #9
def tokenize(lines):
  tokens = []
  strings = []
  functions = {}
  new_lines = ''
  for i, line in enumerate(lines):
    line = re.sub(r'#.*$', "", line)
    line = re.sub('\n', ' ', line)
    line = re.sub('\t', '', line)
    line = re.split('\'', line)
    for j, c in enumerate(line):
      if j % 2 == 0:
        new_lines += c
      else:
        strings.append(c)
        new_lines += 'string ' + str(len(strings) - 1)
  new_lines = re.split(';', new_lines)
  for i, token in enumerate(new_lines):
    if token != '' and token != ' ' and token != '\t':
      token = token.strip()
      token = re.split(' ', token)
      if i % 2 != 0:
        functions[token[0]] = token[1:]
      else:
        tokens += token
  tokens = substitute_tokens(tokens)
  return [tokens, strings, functions]
Example #10
 def save_config(self, data):
     """Save changes to the configuration table."""
     cursor = self.db_conn.cursor()
     cursor.execute('''INSERT INTO configs VALUES (?, ?, ?, ?, ?, ?)''',
                    (data[0], data[1], data[2], data[3], data[6], data[7],))
     self.db_conn.commit()
     cursor.close()
     if type(data[4]) is str:
         channels = re.split(',? ', data[4])
     else:
         channels = data[4]
     if type(data[5]) is str:
         botops = re.split(',? ', data[5])
     else:
         botops = data[5]
     cursor = self.db_conn.cursor()
     if channels != ['']:
         for chan in channels:
             if chan[0] != "#":
                 chan = "#" + chan
             cursor.execute('''INSERT INTO channels VALUES (NULL, ?, 0, ?)''', (chan, data[0]))
     self.db_conn.commit()
     cursor.close()
     cursor = self.db_conn.cursor()
     if botops != ['']:
         for op in botops:
             cursor.execute('''INSERT INTO users VALUES (NULL, ?, NULL, NULL, 1, ?)''',
                            (op, data[0]))
     self.db_conn.commit()
     cursor.close()
Example #11
def scrape_and_look_for_next_link(url):      
    html = scraperwiki.scrape(url)
    #print html
    root = lxml.html.fromstring(html)
    soup = BeautifulSoup(html)                        #using BeautifulSoup to find next page links
    scrape_table(root)                                     #before carrying on scrape the hrefs using the scrape_table function
    #print soup
    
    items = soup.findAll('a',title="Next page")           # findAll "next page" links        
    if items:                                             # if there is a next page link continue
        
        next_link = root.cssselect("div.srch-Page.srch-Page-bg a")
    #print next_link
        if next_link:
            next_link2 = next_link[2].attrib['href']
            #print next_link2
            split_link = re.split("\)+",next_link2)
            split_link2 = re.split("\=+",split_link[0])
            split_link3 = re.split("\'+",split_link2[2])
            #print split_link3[0]
        #print split_link2
        #if split_link ==11:
            next_url = nextlink_url+split_link3[0]
            if next_url:
                print next_url
                scrape_and_look_for_next_link(next_url)
Example #12
 def create_new(self):
     """Create a new configuration."""
     verify = ''
     while verify != 'y':
         print('\n')
         name = ""
         while name == "":
             name = input("Unique name for this configuration: ")
             cursor = self.db_conn.cursor()
             cursor.execute('''SELECT * FROM configs WHERE name = ?''', (name,))
             data = cursor.fetchone()
             cursor.close()
             if data:
                 print('The name "{0}" is not unique.'.format(name))
                 name = ""
         nick = self.prompt("Nick", "GorillaBot")
         realname = self.prompt("Ident", "GorillaBot")
         ident = self.prompt("Realname", "GorillaBot")
         chans = self.prompt("Channel(s)")
         botop = self.prompt("Bot operator(s)", '')
         password = self.prompt("Server password (optional)", hidden=True)
         youtube = self.prompt("YouTube API key (optional)", hidden=True)
         chans = re.split(',? ', chans)
         botop = re.split(',? ', botop)
         self.display((name, nick, realname, ident, password, youtube), chans, botop)
         verify = input('Is this configuration correct? [y/n]: ').lower()
     self.save_config((name, nick, realname, ident, chans, botop, password, youtube))
     return name
Example #13
 def verify(self, data, chans, botops):
     """Verify a configuration, and make changes if needed."""
     verify = input('Is this configuration correct? [y/n]: ').lower()
     if verify == 'y':
         return
     else:
         verify = ''
         while verify != 'y':
             print('\n')
             name = data[0]
             nick = self.prompt("Nick", data[1])
             realname = self.prompt("Ident", data[2])
             ident = self.prompt("Realname", data[3])
             chans = self.prompt("Chans", ", ".join(chans))
             botop = self.prompt("Bot operator(s)", ", ".join(botops))
             password = self.prompt("Server password (optional)", hidden=True)
             youtube = self.prompt("YouTube API key (optional)", hidden=True)
             chans = re.split(',? ', chans)
             botop = re.split(',? ', botop)
             self.display((name, nick, realname, ident, password, youtube), chans, botop)
             verify = input('Is this configuration correct? [y/n]: ').lower()
         self.delete(name)
         cursor = self.db_conn.cursor()
         cursor.execute('''DELETE FROM channels WHERE config = ?''', (name,))
         cursor.execute('''DELETE FROM users WHERE config = ?''', (name,))
         self.db_conn.commit()
         cursor.close()
         self.save_config((name, nick, realname, ident, chans, botop, password, youtube))
Example #14
def _parse_meta(fname):
    """Get the metadata as a dict out of the mitGCM mds .meta file."""

    flds = {}
    basename = re.match("(^.+?)\..+", os.path.basename(fname)).groups()[0]
    flds["basename"] = basename
    with open(fname) as f:
        text = f.read()
    # split into items
    for item in re.split(";", text):
        # remove whitespace at beginning
        item = re.sub("^\s+", "", item)
        # match = re.match('(\w+) = ', item)
        match = re.match("(\w+) = (\[|\{)(.*)(\]|\})", item, re.DOTALL)
        if match:
            key, _, value, _ = match.groups()
            # remove more whitespace
            value = re.sub("^\s+", "", value)
            value = re.sub("\s+$", "", value)
            # print key,':', value
            flds[key] = value
    # now check the needed things are there
    needed_keys = ["dimList", "nDims", "nrecords", "dataprec"]
    for k in needed_keys:
        assert k in flds
    # transform datatypes
    flds["nDims"] = int(flds["nDims"])
    flds["nrecords"] = int(flds["nrecords"])
    # use big endian always
    flds["dataprec"] = np.dtype(re.sub("'", "", flds["dataprec"])).newbyteorder(">")
    flds["dimList"] = [[int(h) for h in re.split(",", g)] for g in re.split(",\n", flds["dimList"])]
    if "fldList" in flds:
        flds["fldList"] = [re.match("'*(\w+)", g).groups()[0] for g in re.split("'\s+'", flds["fldList"])]
        assert flds["nrecords"] == len(flds["fldList"])
    return flds
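For reference, a minimal sketch of the .meta layout this parser expects
(illustrative values only):

 nDims = [   2 ];
 dimList = [
    90, 1, 90,
    40, 1, 40
 ];
 dataprec = [ 'float32' ];
 nrecords = [   1 ];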
Example #15
def update_index_html(dest_dir, sectnum):
  # Process index.html separately from the modules files
  with open(dest_dir + 'index.html', 'r') as index_html_file:
    index_html = index_html_file.readlines()

  for line_num, line in enumerate(index_html):
    #inject css rule to remove haiku's orange bullets
    if '</head>' in line:
      index_html[line_num] = line.replace('</head>','<style>\nul li {\n\tbackground: none;\n\tlist-style-type: none;\n}\n</style>\n</head>')
    elif 'class="section"' in line:
      sectnum += 1
    elif 'RegisterBook' in line:
      #remove registerbook page from TOC
      index_html[line_num] = ''
    elif 'hide-from-toc' in line:
      #remove stub chapter title 
      if '<h1>' in index_html[line_num-1]:
        index_html[line_num-1] = ''
    elif 'class="toctree-l' in line and 'Gradebook' not in line and 'TODO List' not in line:
      title = re.split('>', re.split('</a>', line, flags=re.IGNORECASE)[0], flags=re.IGNORECASE)[-1]
      new_title = '%s.' % sectnum + title
      index_html[line_num] = line.replace(title, new_title)

  # Write the modified contents back to index.html
  with open(dest_dir + 'index.html', 'wb') as index_html_file:
    index_html_file.writelines(index_html)
Example #16
 def _parse_taxon_from_line(self, line, line_index):
     if self.strict:
         seq_label = line[:10].strip()
         line = line[10:]
     else:
         if self.multispace_delimiter:
             parts = re.split('[ \t]{2,}', line, maxsplit=1)
         else:
             parts = re.split('[ \t]{1,}', line, maxsplit=1)
         seq_label = parts[0]
         if len(parts) < 2:
             line = ''
         else:
             line = parts[1]
     seq_label = seq_label.strip()
     if not seq_label:
         raise self._data_parse_error("Expecting taxon label", line_index=line_index)
     if self.underscores_to_spaces:
         seq_label = seq_label.replace('_', ' ')
     current_taxon = self.char_matrix.taxon_set.require_taxon(label=seq_label)
     if current_taxon not in self.char_matrix:
         self.char_matrix[current_taxon] = dataobject.CharacterDataVector(taxon=current_taxon)
     else:
         if len(self.char_matrix[current_taxon]) >= self.nchar:
             raise self._data_parse_error("Cannot add characters to sequence for taxon '%s': already has declared number of characters (%d)" \
                      % (current_taxon.label, len(self.char_matrix[current_taxon])), line_index=line_index)
     return current_taxon, line
Example #17
def compare_time(start, end):
	"""
	<Purpose>
		Manually compares two times.
		Returns True if the end time is the same as or more recent than the
		start time. Returns False otherwise.

	<Arguments>
		start time
		end time

	<Exceptions>
		None

	<Returns>
		Bool
	"""
	s = re.split(r'-|\+|:| ', start)
	e = re.split(r'-|\+|:| ', end)
	# Compare year, month, day, hour, minute, second in order; the first
	# field that differs decides the result.
	for i in range(6):
		if s[i] > e[i]:
			return False
		if s[i] < e[i]:
			return True
	return True
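A usage sketch with zero-padded timestamps (hypothetical values; the zero
padding is what lets the string comparisons above behave numerically):

compare_time('2020-01-02 03:04:05', '2020-01-02 03:04:06')  # True
compare_time('2020-01-02 03:04:06', '2020-01-02 03:04:05')  # False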
Example #18
    def __call__(self, source, output_file, contexts_path):
        self.output = output_file

        blocks = re.split("^((?:<<<<<<<|>>>>>>>)[^\n]*\n)", source, flags=re.MULTILINE)

        in_conflict = False

        for index, block in enumerate(blocks):
            if (index & 1) == 0:
                if in_conflict:
                    blocks = re.split("^(=======[^\n]*\n)", block, flags=re.MULTILINE)
                else:
                    blocks = [block]

                for index, block in enumerate(blocks):
                    if (index & 1) == 0:
                        if block:
                            for token, value in self.lexer.get_tokens(block):
                                self.highlightToken(token, value)
                    else:
                        assert block[0] == "="
                        self.output.write(htmlutils.htmlify(block))
            else:
                assert block[0] == "<" or block[0] == ">"
                self.output.write(htmlutils.htmlify(block))
                in_conflict = block[0] == "<"
Example #19
def addresses(filename, with_subnetsize=None):
    """find ip addresses configured on all interfaces from filename and return
    dict with interface=>(ip=>address, ipv6=>address)"""
    parseresult = filterConfig(filename, "interface",
                               "^interface|^ip address|^ipv6 address")
    ret = dict()
    for sec in parseresult:
        intret = ""
        for line in sec:
            reobj = re.match("interface (.*)", line)
            if reobj:
                intret = reobj.group(1)
            if intret:
                # FIXME: exclude interfaces with shutdown configured
                reobj = re.match("(ip|ipv6) address (.*)", line)
                if reobj:
                    afi = reobj.group(1)
                    if afi == "ip" and with_subnetsize:
                        ip = reobj.group(2).split(" ")[0]
                        if ipaddr.IPAddress(ip).version != 4:
                            continue
                        hostmask = reobj.group(2).split(" ")[1]
                        address = str(ipaddr.IPv4Network(ip + "/" + hostmask))
                    elif afi == "ipv6" and with_subnetsize:
                        address = re.split('[ ]', reobj.group(2))[0]
                    else:
                        address = re.split('[\/ ]', reobj.group(2))[0]
                    if not intret in ret:
                        ret[intret] = dict()
                    ret[intret].update({afi: address})
    return ret
Example #20
def main():
    parser = ArgumentParser(description="", formatter_class=RawDescriptionHelpFormatter, add_help=True)

    parser.add_argument("--data-directory", dest="data_directory", default=None, help="path to directory containing the source instance data to use")
    parser.add_argument("--output-directory", dest="output_directory", default=None, help="path to directory for all of the output instance data")
    parser.add_argument("--duplicates-file", dest="duplicates_file", default=None, help="path to file containing list of duplicate instance data, rows of <shasum> <count> <instance1> <instance2> ...")

    args = parser.parse_args()

    # organize all of the duplicate information
    # rows are in the format <shasum> <instance count> <instance 1> <instance 2>

    instances_to_duplicates = {}
    instance_keys_to_paths = {}
    with open(args.duplicates_file, 'r') as f:
        for line in f:
            line = line.lstrip().rstrip()

            components = line.split(' ')

            shasum = components[0]
            count = int(components[1])
            instances = components[2:]
            instance_keys = [re.split('\.[a-z]+$', os.path.basename(x))[0] for x in instances]

            for key,path in zip(instance_keys, instances):
                instance_keys_to_paths[key] = path

            for key in instance_keys:
                remaining = list(instance_keys)
                remaining.remove(key)

                instances_to_duplicates[key] = remaining

    for instance_data in os.listdir(args.data_directory):
        instance_components = re.split('\.([a-z]+)$', instance_data)
        instance_key = instance_components[0]
        instance_extension = instance_components[1]

        # copy the instance data, then copy it to its duplicate keys if needed
        instance_path = "{}/{}".format(args.data_directory, instance_data)

        shutil.copy(instance_path, args.output_directory)
        if instance_key in instances_to_duplicates:
            for dupe in instances_to_duplicates[instance_key]:
                dupe_filename = "{}.{}".format(dupe, instance_extension)

                source = instance_keys_to_paths[instance_key]
                dest = instance_keys_to_paths[dupe]
                prefix = os.path.commonprefix([source, dest])

                source_suffix = source.replace(prefix, '')
                dest_suffix = dest.replace(prefix, '')

                # modify the content to contain the right file.
                with open(instance_path, 'r') as source_file:
                    with open("{}/{}".format(args.output_directory, dupe_filename), 'w') as dest_file:
                        for line in source_file:
                            modified_line = line.rstrip().replace(source_suffix, dest_suffix)
                            print(modified_line, file=dest_file)
Example #21
    def __init__(self, jsFileString, settings, tabCharacter):
        self.jsFileString = jsFileString
        self.settings = settings
        self.tabCharacter = tabCharacter
        pattern = r'(define|require)\s*\(\s*\[(.*?)\]\s*?,\s*?function\s*?\((.*?)\)'
        self.requireMatch = re.search(pattern, jsFileString,
                                 flags=re.MULTILINE | re.DOTALL)
        if (self.requireMatch != None
            and len(self.requireMatch.groups()) == self.NUM_GROUPS
            ):

            def removeQuotes(s):
                return s.replace('"', '').replace("'", "")
            pathsGroupString = str(self.requireMatch.group(self.PATHS_GROUP))
            pathsGroupString = pathsGroupString.strip(' \t\n')
            splitPaths = re.split('[\s\n]*,[\s\n]*', pathsGroupString)
            self.paths = list(map(removeQuotes, splitPaths))

            self.args = re.split('[\s\n]*,[\s\n]*',
                                 str(self.requireMatch.group(self.ARGS_GROUP)).strip(' \t\n'))

            if len(self.paths) > 0 and len(self.paths[0]) == 0:
                self.paths = []
            if len(self.args) > 0 and len(self.args[0]) == 0:
                self.args = []
        else:
            self.paths = None
            self.args = None
Example #22
def cmpAlphaNum(str1,str2):
   str1=str1.lower()
   str2=str2.lower()
   ReSplit='(\d+)'
   str1=re.split(ReSplit,str1)
   str2=re.split(ReSplit,str2)
   if( ''==str1[0] ):
      str1.remove('')
   if( ''==str1[len(str1)-1] ):
      str1.remove('')
   if( ''==str2[0] ):
      str2.remove('')
   if( ''==str2[len(str2)-1] ):
      str2.remove('')
   for i in range( min( len(str1),len(str2) ) ):
      try:
         str1[i]=int(str1[i])
      except ValueError:
         pass
      try:
         str2[i]=int(str2[i])
      except ValueError:
         pass
      if( str1[i]==str2[i] ):
         continue
      if (str1[i]>str2[i]):
         return 1
      else:
         return -1
   return cmp(len(str1),len(str2))
Example #23
 def extractValues (self, line):
     parts = re.split(':', line)
     raw_values = re.split(',', parts[1])
     values = []
     for rv in raw_values:
         values.append(self.cleanValue(rv))
     return values
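For example, on a hypothetical line (cleanValue is assumed to strip and
normalize each raw token):

# self.extractValues("sensor1: 1.0, 2.0, 3.0")
# passes ' 1.0', ' 2.0', ' 3.0' through cleanValue and returns the results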
Example #24
    def epg_list(self):
        try:
            now = datetime.datetime.now()
            now = '%04d' % now.year + '%02d' % now.month + '%02d' % now.day + '%02d' % now.hour + '%02d' % now.minute + '%02d' % now.second

            file = open(addonEPG,'r')
            read = file.read()
            file.close()
            programmes = re.compile('(<programme.+?</programme>)').findall(read)
        except:
            return

        for programme in programmes:
            try:
                start = re.compile('start="(.+?)"').findall(programme)[0]
                start = re.split('\s+', start)[0]

                stop = re.compile('stop="(.+?)"').findall(programme)[0]
                stop = re.split('\s+', stop)[0]
                if not int(start) <= int(now) <= int(stop): raise Exception()

                channel = common.parseDOM(programme, "programme", ret="channel")[0]

                title = common.parseDOM(programme, "title")[0]
                title = common.replaceHTMLCodes(title).encode('utf-8')

                desc = common.parseDOM(programme, "desc")[0]
                desc = common.replaceHTMLCodes(desc).encode('utf-8')

                epg = "[B][%s] - %s[/B]\n%s" % ('ÔÙÑÁ'.decode('iso-8859-7').encode('utf-8'), title, desc)

                self.epg.update({channel: epg})
            except:
                pass
Example #25
 def __init__(self, gtf_line):
     self.gtf_list = gtf_line
     self.seqname, self.source, self.feature, self.start, self.end, self.score, self.strand, self.frame, self.attribute = gtf_line  # These indexes are defined by the GTF spec
     tmp = map(lambda x: re.split('\s+', x.replace('"', '')),
               re.split('\s*;\s*', self.attribute.strip().strip(';')))
     self.attribute = dict([x for x in tmp if len(x)==2])  # convert attrs to dict
     self.start, self.end = int(self.start) - 1, int(self.end)
Example #26
def GetRestaurantGrid(d, zip):
    br.select_form("Form1")
    br.set_all_readonly(False)
    dt = 'dgResults$ctl' + str(d) + '$ctl00'
#    print dt

    br["__EVENTTARGET"] = dt
    br["__EVENTARGUMENT"] = ''
    request = br.click()
    response1 = br1.open(request)
    
    # find the window open hidden in the script
    html1 = response1.read()
#    print html1
    root1 = lxml.html.fromstring(html1)
    rest_name = root1.cssselect("span#lblName")[0].text
    rest_address = root1.cssselect("span#lblAddress")[0].text
    cityStateZip = root1.cssselect("span#lblCityStateZip")[0].text
    city = re.split(",", cityStateZip)[0]
    rest_inspectionDate = root1.cssselect("span#lblLastInspection")[0].text
    if rest_inspectionDate == " ":
        date = ""
    else:
        date = re.split(":", rest_inspectionDate)[1].strip()
    violations = parseViolations(html1)
#    print violations

    scraperwiki.sqlite.save(unique_keys=["dt"], data={"dt": dt + "_" + zip + "_" + str(datetime.date.today()), "name": rest_name, "address": rest_address, "city": city, "state":"NY", "zip": zip, "inspection_date": date, "violations": violations, "time_scraped":datetime.datetime.now(), "page_id" : dt})
Example #27
def ReadCropAttrs(cropFile):
    if not os.path.exists(cropFile):
        cropFile = TXT_DB_DIR + os.sep + CROP_FILE

    f = open(cropFile)
    lines = f.readlines()
    f.close()

    attrDic = {}

    fields = [item.replace('"', '')
              for item in re.split('\t|\n', lines[0]) if item != '']
    n = len(fields)

    for i in xrange(n):
        attrDic[fields[i]] = {}

    for line in lines[2:]:
        items = [item.replace('"', '')
                 for item in re.split('\t', line) if item != '']
        id = int(items[0])

        for i in xrange(n):
            dic = attrDic[fields[i]]
            try:
                dic[id] = float(items[i])
            except:
                dic[id] = items[i]

    return attrDic
Example #28
def find_time_interval(fits):
    """
    find time interval of the fits file
    input:  fits            --- fits file name
    output: [tmin, tmax]    --- start and stop time in seconds from 1998.1.1
    """
    cmd = 'dmstat "' + fits + '[cols time]" centroid=no >' + zspace
    scf.run_ascds(cmd)

    out = scf.read_file(zspace, remove=1)

    chk = 0
    for val in out:
        mc1 = re.search('min', val)
        mc2 = re.search('max', val)

        if mc1 is not None:
            atemp = re.split('\s+', val)
            tmin  = int(float(atemp[1]))
            chk  += 1

        elif mc2 is not None:
            atemp = re.split('\s+', val)
            tmax  = int(float(atemp[1]))
            chk  += 1

        if chk > 1:
            break

    return [tmin, tmax]
Example #29
def split_string_with_lines(string, indentation = "", chars_per_line = 100):
    # expert splitting mode
    matches = re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', string)
    splitted = []
    for s in matches:
      splitted.append( s + ("," if s != matches[-1] else ""))

    res = []
    buf = ""
    for s in splitted:
      if len(s) > chars_per_line:
        splitted2 = re.split(''' (?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', s)
        for s2 in splitted2:
          ext = s2
          if s2 == splitted2[-2]:
            if len(splitted2[-1]) <= 5:
              ext += " " + splitted2[-1]

          buf += ext + (" " if s2 != splitted2[-1] and ext == s2 else "")

          if len(buf) >= chars_per_line or s2 == splitted2[-1]:
            res.append(buf)
            buf = ""
            if ext != s2:
              break
      else:
        buf += s

        if len(buf) >= chars_per_line or s == splitted[-1]:
          res.append(buf)
          buf = ""
    return ("\n%s"%indentation).join( res ), len( res )
Example #30
def parse_relationship(expression):
    """
    Parse a relationship expression containing a package name and (optionally)
    a version relation of the form ``python (>= 2.6)``. Raises
    :py:exc:`ValueError` when parsing fails. An example:

    >>> from deb_pkg_tools.deps import parse_relationship
    >>> parse_relationship('python')
    Relationship(name='python')
    >>> parse_relationship('python (<< 3)')
    VersionedRelationship(name='python', operator='<<', version='3')

    :param expression: A relationship expression (a string).
    :returns: A :py:class:`Relationship` object.
    """
    tokens = [t.strip() for t in re.split('[()]', expression) if t and not t.isspace()]
    if len(tokens) == 1:
        # Just a package name (no version information).
        return Relationship(tokens[0])
    elif len(tokens) != 2:
        # Encountered something unexpected!
        msg = "Corrupt package relationship expression: Splitting name from relationship resulted in more than two tokens! (expression: %r, tokens: %r)"
        raise ValueError(msg % (expression, tokens))
    else:
        # Package name followed by relationship to specific version(s) of package.
        name, relationship = tokens
        tokens = [t.strip() for t in re.split('([<>=]+)', relationship) if t and not t.isspace()]
        if len(tokens) != 2:
            # Encountered something unexpected!
            msg = "Corrupt package relationship expression: Splitting operator from version resulted in more than two tokens! (expression: %r, tokens: %r)"
            raise ValueError(msg % (relationship, tokens))
        return VersionedRelationship(name, *tokens)
Example #31
import pickle
import re
# import jieba
# import jieba.analyse
import operator

data = []
with open("./newsData.pkl", 'rb') as fr:
    data = pickle.load(fr)

likes = dict()
posts = dict()
for i in range(len(data)):
    authors = re.split('、|,' ,data[i][1].replace(" ",""))
    likesCnt = data[i][2]
    for author in authors:
        try:
            likes[author] += int(likesCnt)
            posts[author] += 1
        except:
            likes[author] = int(likesCnt)
            posts[author] = 1

authorAvgLikes = dict()
for author in likes:
    authorAvgLikes[author] = int(likes[author]/posts[author])
print(sorted(authorAvgLikes.items(), key=operator.itemgetter(1)))
# allTag = []
# for i in range(1, len(data)):
    # allTag += jieba.analyse.extract_tags(data[i][1],topK=3)
Example #32
            dfs_2(each_vertex)
            if counter > max_scc[4]:
                max_scc[4] = counter
                max_scc.sort(reverse = True)

def dfs_2(vertex):
    global counter
    leaders[vertex] = current_source
    counter += 1
    for each_vertex in adjacent_list[vertex]:
        if leaders[each_vertex] < 0:
            dfs_2(each_vertex)

input_file = open("SCC.txt")
for each_line in input_file:
    vertex1, vertex2 = re.split('[ \t\n\r]', each_line.strip())
    vertex1 = int(vertex1) - 1
    vertex2 = int(vertex2) - 1
    reversed_adjacent_list[vertex2].append(vertex1)
input_file.close()
dfs_loop_1()
del reversed_adjacent_list
input_file = open("SCC.txt")
for each_line in input_file:
    vertex1, vertex2 = re.split('[ \t\n\r]', each_line.strip())
    vertex1 = int(vertex1) - 1
    vertex2 = int(vertex2) - 1
    adjacent_list[vertex1].append(vertex2)
input_file.close()
leaders = [-1 for i in range(MAX)]
dfs_loop_2()
Example #33
print(l);
print(list("%d" % x for x in range(1, 10)));
print("Hi {0}, 成绩提高了{1:.1f}%".format("小明", 1.254));
print("Hi {0}, 成绩提高了{1}%".format("小明", 1.254));
print("Hi {0}, 成绩提高了{1}%".format("小明", "%.1f"%1.254));
print("=".join(["cdsac","cdsa","dewqd"]));

# ==== Regular expressions ====
email_re = "^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$";
if re.match(email_re, "hujiangyx163.com"):
    print("ok");
else:
    print("error");
# ---- Splitting strings ----
print("a b c".split(" "));
print(re.split(r'\s+',"a b c"));
print(re.split(r"[\s\,\;]+", "a,b;; c   d"));
# ---- Grouping ----
match = re.match(r'^(\d{3})-(\d{3,8})$', "020-123456")
print(match.group());
print(match.group(0));
print(match.group(1));
print(match.group(2));
new_line = r'截至9月2日0时,全省累计报告新型冠状病毒肺炎确诊病例653例(其中境外输入112例),' \
    r'累计治愈出院626例,死亡3例,目前在院隔离治疗24例,964人尚在接受医学观察';
new_line_re = r'^截至9月2日0时,全省累计报告新型冠状病毒肺炎确诊病例(\d+)例\(其中境外输入(\d+)例\),' \
    r'累计治愈出院(\d+)例,死亡(\d+)例,目前在院隔离治疗(\d+)例,(\d+)人尚在接受医学观察$';
new_line_math = re.match(new_line_re, new_line);
print(new_line_math.group(0));
print(new_line_math.group(1));
print(new_line_math.group(2));
Example #34
 def analizar(self):
     self.texto = ""
     arr = re.split("\n", self.fila.texto)
     for (linea) in arr:
         self.texto += mejorar_links(linea)
Example #35
        return self._stemmer.stem(word).lower()


grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie')

wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]

raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""

import re
re.split(r' ', raw)
re.split(r'[ \t\n]+', raw)
re.split(r'\s+', raw)
re.split(r'\W+', raw)
re.findall(r'\w+|\S\w*', raw)

text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)([A-Z]\.)+| \w+(-\w+)*| \$?\d+(\.\d+)?%?| \.\.\.| [][.,;"'?():-_`]'''

nltk.regexp_tokenize(text, pattern)

fdist = nltk.FreqDist(
    ['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print(word, ":", fdist[word], end='; ')
Example #36
 def dir_path_handle(path_string, name_handle_func):
     name_list = re.split(r'[\\/]', path_string)
     crypto_list = [name_handle_func(s) for s in name_list]
     return '/'.join(crypto_list)
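A usage sketch (name_handle_func can be any per-component transform; str.upper
here is just for illustration):

dir_path_handle(r'C:\data\secret', str.upper)  # -> 'C:/DATA/SECRET'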
Example #37
    def get_active_zone_set(self):
        """Return the active zone configuration.

        Return the active zoneset from the fabric. If no configuration is
        active, an empty map is returned.

        :returns: Map -- active zone set map in the following format

        .. code-block:: python

            {
                'zones':
                    {'openstack50060b0000c26604201900051ee8e329':
                        ['50060b0000c26604', '201900051ee8e329']
                    },
                'active_zone_config': 'OpenStack_Cfg'
            }
        """
        zone_set = {}
        zone = {}
        zone_member = None
        zone_name = None
        switch_data = None
        zone_set_name = None
        try:
            switch_data = self._get_switch_info(
                [zone_constant.GET_ACTIVE_ZONE_CFG])
        except exception.BrocadeZoningCliException:
            with excutils.save_and_reraise_exception():
                LOG.error(
                    _LE("Failed getting active zone set "
                        "from fabric %s"), self.switch_ip)
        try:
            for line in switch_data:
                line_split = re.split('\\t', line)
                if len(line_split) > 2:
                    line_split = [x.replace('\n', '') for x in line_split]
                    line_split = [x.replace(' ', '') for x in line_split]
                    if zone_constant.CFG_ZONESET in line_split:
                        zone_set_name = line_split[1]
                        continue
                    if line_split[1]:
                        zone_name = line_split[1]
                        zone[zone_name] = list()
                    if line_split[2]:
                        zone_member = line_split[2]
                        zone_member_list = zone.get(zone_name)
                        zone_member_list.append(zone_member)
            zone_set[zone_constant.CFG_ZONES] = zone
            zone_set[zone_constant.ACTIVE_ZONE_CONFIG] = zone_set_name
        except Exception:
            # A parsing error here most likely means malformed CLI output.
            msg = _("Malformed zone configuration: (switch=%(switch)s "
                    "zone_config=%(zone_config)s).") % {
                        'switch': self.switch_ip,
                        'zone_config': switch_data
                    }
            LOG.exception(msg)
            raise exception.FCZoneDriverException(reason=msg)
        switch_data = None
        return zone_set
Example #38
    def __init__(self,
                 raw_string,
                 split_expression=r'\W+',
                 bow=True,
                 entity=[]):
        """Initializer.

        Args:
            raw_string: string with raw text in it
            split_expression: string will be split by this.
            bow: if True, a word is the same everywhere in the text - i.e. we
                 will index multiple occurrences of the same word. If False,
                 order matters, so that the same word will have different ids
                 according to position.
            entity: list with the indices of the entity for which the
                explanations are required, used so that the entity is always
                perturbed as a group and never broken apart.
        """
        split_expression_non_vocab = r'\W+'  # added for the conll data set
        self.raw = raw_string
        self.as_list = re.split(r'(%s)|$' % split_expression, self.raw)
        self.as_np = np.array(self.as_list)
        non_word = re.compile(r'(%s)|$' % split_expression_non_vocab).match
        self.string_start = np.hstack(
            ([0], np.cumsum([len(x) for x in self.as_np[:-1]])))
        vocab = {}
        self.inverse_vocab = []
        self.positions = []
        self.bow = bow
        non_vocab = set()
        for i, word in enumerate(self.as_np):
            if word in non_vocab:
                continue
            if non_word(word):
                non_vocab.add(word)
                continue
            if bow:
                if word not in vocab:
                    vocab[word] = len(vocab)
                    self.inverse_vocab.append(word)
                    self.positions.append([])
                idx_word = vocab[word]
                self.positions[idx_word].append(i)
            else:
                self.inverse_vocab.append(word)
                self.positions.append(i)
        if not bow:
            self.positions = np.array(self.positions)

        # Get new indices for the entity (if there is one)
        if entity:
            # Calculate indices with respect to as_list encoding (only works for space as the split_expression)
            if split_expression == ' ':
                self.entity_as_list = [ele * 2 for ele in entity]
            else:
                print(
                    "Need to split the example per space (set "
                    "split_expression = ' ' in lime_text.py).")
                return
            # Calculate indices with respect to the vocab encoding (the one LIME calculates by removing nonvocab tokens)
            self.entity_as_vocab = []
            for ele in self.entity_as_list:
                # Need to use np since positions is a numpy array
                idx_array = np.where(
                    self.positions == ele)  # returns a tuple (array, dtype)
                if idx_array[0].size != 0:
                    self.entity_as_vocab.append(idx_array[0].item())
                else:
                    print(
                        'Problem finding indices of the entities (np.where in lime_text.py).'
                    )
                    print('For raw string ' + raw_string)
                    return
Example #39
        row_count += 1

print 'Total Tickets to be checked: ', row_count

for i in range(2, row_count + 2):
    DL = ws.cell(row=i, column=22).value
    Wo_Ref = ws.cell(row=i, column=1).value
    print 'WO Ref taken from the corrective task excel', Wo_Ref
    Yesterday_Comment = []
    DL1 = DL.encode("utf-8")
    #print type (DL1)
    #print DL1
    #print len(DL1.splitlines())
    #pattern=re.compile(r'\d+/\d+/\d+\s\d+:\d+:\d+\s[A-Z]{2}')
    pattern = re.compile(r'\d+:\d+:\d+\s[A-Z]{2}')
    DL_Filtered = re.split(pattern, DL1)
    #print DL_Filtered
    for j in DL_Filtered:
        if date1 in j:
            Yesterday_Comment.append(j.lower())
            #print 'The Yesterday_Comment List: ',Yesterday_Comment

    #print Yesterday_Comment
    #print 'Length: ',len(Yesterday_Comment)

    Yesterday_Comment_Rev = list(Yesterday_Comment)
    Yesterday_Comment_Rev.reverse()
    #Day_DL=' '

    #for i in Yesterday_Comment:
    #   Day_DL=Day_DL+str(i)
Example #40
def split_term_classifiers(line):
    # type: (unicode) -> List[Union[unicode, None]]
    # split line into a term and classifiers. If there is no classifier, None is used.
    parts = re.split(' +: +', line) + [None]
    return parts
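For instance (illustrative input in Sphinx's definition-list syntax):

split_term_classifiers('term : classifier one : classifier two')
# -> ['term', 'classifier one', 'classifier two', None]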
Example #41
import re
text = open("hindi_file.txt",encoding= 'utf-8').read()
#This converts the encoded text to an internal unicode object, where
# all characters are properly recognized as an entity:
words = re.split(r'\s+', re.sub(r'[,/\-!?.|lI।"\]\[<>br]', ' ', text).strip())
print(words)
text = ' '.join(words)
fh = open("hindi.txt","w", encoding='utf-8').write(text)
Example #42
#rename the tip names in a Newick tree based on a tab-delimited file linking name codes to full names
from ete3 import Tree
import re, sys

#arg1 - tree file
#arg2 - names mapping file
#arg3 - output file with renamed Newick tree

names = {}

tree = Tree(sys.argv[1])  #Newick tree

#read in a tab-delimited species names file
tblfile = open(sys.argv[2])
for line in tblfile:
    fields = re.split("\t", line.rstrip())
    names[fields[0]] = fields[1]

for leaf in tree:
    if leaf.name in names:
        leaf.name = names[leaf.name]

tree.write(outfile=sys.argv[3])
Example #43
def load_instance(path_to_file):

    ### load raw text ###
    f = open(path_to_file, 'r')
    raw_lines = f.read().splitlines()
    stripped_lines = []
    for line in raw_lines:
        line = line.replace('\t', ',').replace('[', '').replace(']', '')
        stripped_lines.append(re.split(',', line))

    ### first line ###
    first_line = stripped_lines[0]
    n_activities = int(
        first_line[0])  # number of activities (incl. dummy activities)
    n_resources = int(first_line[1])

    ### load tasks (main body of instance file) ###
    tasks = {}
    for activity in range(n_activities):
        ### first block ###
        line1 = stripped_lines[activity + 1]
        task_id = int(line1[0])
        n_successors = [
            int(line1[2]),
            int(line1[3]),
            int(line1[4]),
            int(line1[5])
        ]  # [# SS successors, # SF successors, # FS successors, # FF successors]
        successors = [[] for i in range(4)]
        k = 0  # counter to track where in line1 to get desired info.
        for i in range(4):
            if n_successors[i] > 0:
                for j in range(n_successors[i]):
                    successors[i].append(
                        (int(line1[6 + 2 * k + j]),
                         int(line1[6 + 2 * k + n_successors[i] + j]))
                    )  # e.g. successor[i=2(FS)] = [(FS successor id, min. time-lag),...]
                k += n_successors[i]
        ### second block ###
        line2 = stripped_lines[n_activities + activity + 1]
        k = int(line2[2])  # principle resource index
        w_k = int(line2[3])  # principle resource work-content
        q_min = []  # min. per-period resource allocation for each resource
        q_max = []  # max. per-period resource allocation for each resource
        for r in range(n_resources):
            q_min.append(int(line2[4 + 2 * r]))
            q_max.append(int(line2[4 + 2 * r + 1]))
        ### create task ###
        task = Task(task_id, successors, k, w_k, q_min, q_max)
        tasks[task_id] = task

    ### last line ###
    last_line = stripped_lines[2 * n_activities + 1]
    R_max = []  # resource_availabilities
    for r in range(n_resources):
        R_max.append(int(last_line[r]))
    l_min = int(last_line[n_resources])  # min. block length
    ### create project ###
    name = os.path.splitext(os.path.basename(
        os.path.normpath(path_to_file)))[0]
    project = Project(name, tasks, R_max, l_min)
    return (project)
Example #44
 def get_elements(self):
     buff = self.buff.replace("\n", " ")
      # split on multiple delimiters: "; ", ", ", "-", ".", "?", ":"
     import re
     for i in re.split('; |, |-|\.|\?|:', buff):
         yield i
Example #45
     m = typedef2_pat.match(line)
     if m:
         mode = IN_ENUM
         decls = {}
         idx   = 0
 elif mode == FOUND_ENUM:
     m = openbrace_pat.match(line)
     if m:
         mode  = IN_ENUM
         decls = {}
         idx   = 0
     else:
         assert False, "Invalid z3_api.h, line: %s" % linenum
 else:
     assert mode == IN_ENUM
     words = re.split('[^\-a-zA-Z0-9_]+', line)
     m = closebrace_pat.match(line)
     if m:
         name = words[1]
         z3consts.write('# enum %s\n' % name)
         for k, i in decls.iteritems():
             z3consts.write('%s = %s\n' % (k, i))
         z3consts.write('\n')
         mode = SEARCHING
     else:
         if words[2] != '':
             if len(words[2]) > 1 and words[2][1] == 'x':
                 idx = int(words[2], 16)
             else:
                 idx = int(words[2])
         decls[words[1]] = idx
Example #46
def word_count(phrase):
    counter = Counter()
    for word in split('\W+', phrase.lower()):
        if word == '' : continue
        counter[word] += 1
    return counter
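A usage sketch, with the imports this snippet relies on:

from collections import Counter
from re import split

word_count('The cat and the dog.')  # -> Counter({'the': 2, 'cat': 1, 'and': 1, 'dog': 1})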
Example #47
def gen(lang):
    global include, INCL_DIR
    print('Generating bindings for', lang)
    templ = template[lang]
    for target in include:
        prefix = templ[target]
        outfile = open(templ['out_file'] % (prefix),
                       'wb')  # open as binary prevents windows newlines
        outfile.write((templ['header'] % (prefix)).encode("utf-8"))

        lines = open(INCL_DIR + target).readlines()

        count = 0
        for line in lines:
            line = line.strip()

            if line.startswith(MARKUP):  # markup for comments
                outfile.write(("\n%s%s%s\n" %(templ['comment_open'], \
                                              line.replace(MARKUP, ''), \
                                              templ['comment_close']) ).encode("utf-8"))
                continue

            if line == '' or line.startswith('//'):
                continue

            if line.startswith('#define '):
                line = line[8:]  #cut off define
                xline = re.split('\s+', line, 1)  # split into at most 2 expressions
                if len(xline) != 2:
                    continue
                if '(' in xline[0] or ')' in xline[
                        0]:  #does it look like a function
                    continue
                xline.insert(
                    1, '=')  # insert an = so the expression below can parse it
                line = ' '.join(xline)

            if not line.startswith(prefix.upper()):
                continue

            tmp = line.strip().split(',')
            for t in tmp:
                t = t.strip()
                if not t or t.startswith('//'): continue
                # hacky: remove type cast (uint64_t)
                t = t.replace('(uint64_t)', '')
                t = re.sub(r'\((\d+)ULL << (\d+)\)', r'\1 << \2',
                           t)  # (1ULL<<1) to 1 << 1
                f = re.split('\s+', t)

                if f[0].startswith(prefix.upper()):
                    if len(f) > 1 and f[1] not in ('//', '///<', '='):
                        print("Error: Unable to convert %s" % f)
                        continue
                    elif len(f) > 1 and f[1] == '=':
                        rhs = ''.join(f[2:])
                    else:
                        rhs = str(count)
                        count += 1

                    try:
                        count = int(rhs) + 1
                        if (count == 1):
                            outfile.write(("\n").encode("utf-8"))
                    except ValueError:
                        if lang == 'ocaml':
                            # ocaml uses lsl for '<<', lor for '|'
                            rhs = rhs.replace('<<', ' lsl ')
                            rhs = rhs.replace('|', ' lor ')
                            # ocaml variable has _ as prefix
                            if rhs[0].isalpha():
                                rhs = '_' + rhs

                    outfile.write((templ['line_format'] %
                                   (f[0].strip(), rhs)).encode("utf-8"))

        outfile.write((templ['footer']).encode("utf-8"))
        outfile.close()
Example #48
# from sys import stdin
# inFile = stdin.readlines
import re
inFile = open('input.txt', 'r', encoding='utf8')
lines = str(inFile.readlines())
# words = lines.replace(';', ' ').split()
# print(words)
newLines = re.split(r', |_|-|!', lines)
# print(newLines)
myList = set()
# count = 0
for elem in newLines:
    myList.add(elem)
print(myList)

punctuation = ['.', ',', ':', ';', '!', '?', '(', ')']
wordList = lines.split()

inFile.close()


# re.split('(\W+)', 'Words, words, words.')
Example #49
def change_dir(command):
    current_path = os.getcwd()                    # get the current directory
    command, desired_dir = re.split(" ", command) # get the command and the desired dir
    os.chdir(desired_dir)
Example #50
def natural_keys(text):
    return [atoi(c) for c in re.split('(\d+)', text)]
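An illustrative call (atoi is assumed to convert digit chunks to int and leave
other chunks as strings), so that 'file2' sorts before 'file10':

natural_keys('file10.txt')  # -> ['file', 10, '.txt']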
Example #51
# Read policy from file
policy = [];
line = f.readline();
while line:
    line = line[0:-1];
    policy.append(int(line));
    line = f.readline();

# Get standardized ghost locations
f = open("ghostLocs.txt","r");
line = f.readline();
all_ghost_x = [];
all_ghost_y = [];
while line:
    line = line[0:-1];
    ghostx0, ghostx1, ghosty0, ghosty1 = re.split(' ',line);
    ghostLocX = [int(ghostx0),int(ghostx1)];
    ghostLocY = [int(ghosty0), int(ghosty1)];
    all_ghost_x.append(ghostLocX);
    all_ghost_y.append(ghostLocY);
    line = f.readline();

###############################################################################
############################  Game Simulation #################################
###############################################################################
pacman_x, pacman_y = 1, 3
num_ghosts = 2
# ghost_x = [2,3]
# ghost_y = [2,3]

win_count = 0
Example #52
0
def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    # tryint is assumed defined elsewhere: int(c) for digit runs, else c unchanged
    return [tryint(c) for c in re.split(r'([0-9]+)', s)]
Example #53
0
def tokenize(text: str) -> List[str]:
    # requires: import re; from typing import List
    return re.split(" ", text)


# Fragment: the tail of a clean(s) helper whose opening branch is missing;
# clean is used by map(clean, sentences) below.
    elif u'’/s' not in s:
        return s.replace(u'‘/s ', '')
    else:
        return s


texts = u''.join(map(clean, sentences))  # concatenate all the words
print('Length of texts is %d' % len(texts))
print('Example of texts: \n', texts[:300])

# file_object = open('train_clean.txt', 'w')
# file_object.write(str(texts.decode('utf-8')))
# file_object.close()

# re-split the text on the tagged punctuation marks
sentences = re.split(u'[,。!?、‘’“”]/[bems]', texts)
print('Sentences number:', len(sentences))
print('Sentence Example:\n', sentences[0])


def get_Xy(sentence):
    """Turn a sentence into [word1, w2, ..wn], [tag1, t2, ...tn]."""
    words_tags = re.findall('(.)/(.)', sentence)  # requires: import numpy as np
    if words_tags:
        words_tags = np.asarray(words_tags)
        words = words_tags[:, 0]
        tags = words_tags[:, 1]
        return words, tags  # the characters and tags become data / label respectively
    return None
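# A hedged demo of the (.)/(.) pair extraction get_Xy relies on,
# using a made-up char/tag string in the b/e/m/s scheme:
import re
import numpy as np

pairs = re.findall('(.)/(.)', u'人/b 民/e ,/s')
print(pairs)                 # [('人', 'b'), ('民', 'e'), (',', 's')]
arr = np.asarray(pairs)
print(arr[:, 0], arr[:, 1])  # characters, then tags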

Example #55
0
def _extractCoreInfo(unique_rep):
    '''
    Return the core information of the given unique representation,
    i.e. the comma-separated fields between '[' and ']'.
    '''
    return re.split(
        ',', unique_rep[unique_rep.find('[') + 1:unique_rep.find(']')])
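# Hedged usage sketch with a made-up "name[a,b,c]" style representation:
import re
print(_extractCoreInfo("Widget[red,large,metal]"))
# ['red', 'large', 'metal']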
Example #56
0
    def greedy_decode(self, sentence, trg_gender=None, max_len=512):
        # vectorizing the src sentence on the char level and word level
        sentence = re.split(r'(\s+)', sentence)
        vectorized_src_sentence_char = [self.src_vocab_char.sos_idx]
        vectorized_src_sentence_word = [self.src_vocab_word.sos_idx]

        for word in sentence:
            for c in word:
                vectorized_src_sentence_char.append(
                    self.src_vocab_char.lookup_token(c))
                vectorized_src_sentence_word.append(
                    self.src_vocab_word.lookup_token(word))

        vectorized_src_sentence_word.append(self.src_vocab_word.eos_idx)
        vectorized_src_sentence_char.append(self.src_vocab_char.eos_idx)

        # getting sentence length
        src_sentence_length = [len(vectorized_src_sentence_char)]

        # vectorizing the trg gender
        if trg_gender:
            vectorized_trg_gender = self.trg_gender_vocab.lookup_token(
                trg_gender)
            vectorized_trg_gender = torch.tensor([vectorized_trg_gender],
                                                 dtype=torch.long)
        else:
            vectorized_trg_gender = None

        # converting the lists to tensors
        vectorized_src_sentence_char = torch.tensor(
            [vectorized_src_sentence_char], dtype=torch.long)
        vectorized_src_sentence_word = torch.tensor(
            [vectorized_src_sentence_word], dtype=torch.long)
        src_sentence_length = torch.tensor(src_sentence_length,
                                           dtype=torch.long)

        # passing the src sentence to the encoder
        with torch.no_grad():
            encoder_outputs, encoder_h_t = self.model.encoder(
                vectorized_src_sentence_char, vectorized_src_sentence_word,
                src_sentence_length)

        # creating attention mask
        attention_mask = self.model.create_mask(vectorized_src_sentence_char,
                                                self.src_vocab_char.pad_idx)

        # initializing the first decoder_h_t to encoder_h_t
        decoder_h_t = encoder_h_t

        context_vectors = torch.zeros(1,
                                      self.model.encoder.rnn.hidden_size * 2)

        # initializing the trg sequences to the <s> token
        trg_seqs = [self.trg_vocab_char.sos_idx]

        with torch.no_grad():
            for i in range(max_len):
                y_t = torch.tensor([trg_seqs[-1]], dtype=torch.long)

                # do a single decoder step
                prediction, decoder_h_t, atten_scores, context_vectors = self.model.decoder(
                    trg_seqs=y_t,
                    encoder_outputs=encoder_outputs,
                    decoder_h_t=decoder_h_t,
                    context_vectors=context_vectors,
                    attention_mask=attention_mask,
                    trg_gender=vectorized_trg_gender)

                # getting the most probable prediction
                max_pred = torch.argmax(prediction, dim=1).item()

                # if we reach </s> token, stop decoding
                if max_pred == self.trg_vocab_char.eos_idx:
                    break

                trg_seqs.append(max_pred)

        str_sentence = self.get_str_sentence(trg_seqs, self.trg_vocab_char)
        return str_sentence
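# Aside: re.split(r'(\s+)', ...) in greedy_decode keeps the whitespace runs as
# tokens (capturing group), so the original spacing can be reconstructed:
import re
parts = re.split(r'(\s+)', 'ya  hala bik')
print(parts)           # ['ya', '  ', 'hala', ' ', 'bik']
print(''.join(parts))  # round-trips to the original string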
    if not output_dirs:

        # Make transaction dirs if they don't exist
        # * I have my statements saved in sub dirs by year so this creates those
        output_dirs = sorted([f"{TRANSACTIONS_FOLDER}/{d}" for d in dirs])
        for transaction_dir in output_dirs:
            if not os.path.isdir(transaction_dir):
                os.makedirs(transaction_dir)

    if files:
        for filename in files:
            path = f"{root}/{filename}"
            if os.path.splitext(path)[1] == ".pdf":
                # unpack is assumed to be tika's unpack module (text extraction)
                contents = unpack.from_file(path).get("content", "")
                iterator = iter(re.split(f"({'|'.join(keywords)})", contents))

                file_data = []

                for key in iterator:
                    if key in keywords:

                        try:
                            value = next(iterator)

                            if key == TRANSACTIONS_HEADER:

                                # Split by the date format: "Jan 1, 1970"
                                # or 2 new lines
                                split = re.split(
                                    r"(\w{3} \d{1,2}, 20\d{2})|\n\n", value,
Example #58
0
    def clean_optical_configs(self):
        ocs = self.cleaned_data["optical_configs"]
        cleaned = []
        namestore = []
        if self.instance:
            namestore.extend(
                [oc.name for oc in self.instance.optical_configs.all()])
        # on update form allow for the same name (case insensitive)
        brackets = re.compile(r"[\[\]\{\}]")

        def _getpromise(fname):
            fp = FilterPromise(fname)
            if fp.is_valid:
                return fp
            else:
                self.add_error(
                    "optical_configs",
                    "Filter not found in database or at Chroma/Semrock: "
                    "{}".format(fname),
                )
                return None

        def lookup(fname, n=None):
            # lookup filter name in database, then check on chroma/semrock
            if not fname:
                return None
            if isinstance(fname, str) and fname.isdigit():
                if n in (2, 3):
                    self.add_error(
                        "optical_configs",
                        'Laser lines (integers) are only accepted in the second position (err: "%s" in position %d)'
                        % (fname, n + 1),
                    )
                elif int(fname) < 300 or int(fname) > 1600:
                    self.add_error(
                        "optical_configs",
                        "Laser wavelengths must be between 300-1600.  Got: %s"
                        % fname,
                    )
                else:
                    return int(fname)
            try:
                return Filter.objects.get(name__icontains=fname)
            except MultipleObjectsReturned:
                try:
                    return Filter.objects.get(part__iexact=fname)
                except ObjectDoesNotExist:
                    return _getpromise(fname)
            except ObjectDoesNotExist:
                return _getpromise(fname)
            return None

        for linenum, line in enumerate(ocs.splitlines()):
            try:
                if not line:
                    continue
                try:
                    if (line.index("{") < line.index(",")) or (
                            line.index("}") < line.index(",")):
                        self.add_error(
                            "optical_configs",
                            "No curly braces allowed in name (line #{})".
                            format(linenum + 1),
                        )
                        continue
                except Exception:
                    pass
                _out = []
                if brackets.search(line):
                    _splt = [
                        i.strip() for i in re.split(r"({[^}]*})", line)
                        if i.strip()
                    ]
                    splt = []
                    for item in _splt:
                        if brackets.search(item):
                            splt.append([
                                n.strip()
                                for n in brackets.sub("", item).split(",")
                                if n.strip()
                            ])
                        else:
                            if item.endswith(","):
                                item = item[:-1]
                            if item.startswith(","):
                                item = item[1:]
                            splt.extend([n.strip() for n in item.split(",")])
                else:
                    splt = [i.strip() for i in line.split(",")]
                if len(splt) not in (4, 5):
                    self.add_error(
                        "optical_configs",
                        "Lines must have 4 or 5 comma-separated fields but this one "
                        "has {}: {}".format(len(splt), line),
                    )
                for n, f in enumerate(splt):
                    if n == 0:
                        if f in namestore:
                            self.add_error(
                                "optical_configs",
                                "Optical config with the name %s already exists."
                                % f,
                            )
                        else:
                            namestore.append(f)
                            _out.append(f)
                    elif n == 4:
                        try:
                            if f.lower() in ("0", "false", "none"):
                                _out.append(False)
                            else:
                                _out.append(True)
                        except Exception:
                            self.add_error(
                                "optical_configs",
                                "Unable to parse Boolean in position 5: %s" %
                                f,
                            )
                    else:
                        if isinstance(f, list):
                            _out.append([lookup(x, n) for x in f])
                        else:
                            _out.append(lookup(f, n))
                cleaned.append(_out)
            except Exception:
                self.add_error(
                    "optical_configs",
                    "Uknown error parsing line #{}: {}".format(
                        linenum + 1, line),
                )
        return cleaned
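# A hedged demo of the bracket-group split used above: the capturing group in
# r"({[^}]*})" keeps each {...} chunk as its own token, so grouped filter
# names stay together. The line content is made up:
import re
line = "myconfig, {ex1, ex2}, 488, em1, true"
print([i.strip() for i in re.split(r"({[^}]*})", line) if i.strip()])
# ['myconfig,', '{ex1, ex2}', ', 488, em1, true']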
def search(path_to_index):
    # requires: import re, time, pickle, operator, nltk
    stemmer = nltk.stem.SnowballStemmer('english')

    stop_words = {}
    reg = re.compile("\"|,| ")
    stop_file = open("stop_words.txt", "r")
    content = stop_file.read()
    content = re.split(reg, content)
    for word in content:
        if word:
            stop_words[word] = True
            
    title_tags = open(path_to_index+"/title_tags.txt", "r")
    title_position = pickle.load(open(path_to_index+"/title_positions.pickle", "rb"))
    word_position = pickle.load(open(path_to_index+"/word_positions.pickle", "rb"))

    field_map = {"t" : 0, "b" : 1, "i" : 2, "c" : 3}
    field_chars = ["t", "b", "i", "c"] 
    files = []

    for f in field_chars :
        file = path_to_index+ "/" + f + ".txt"
        fp = open(file, "r")
        files.append(fp)
        
#     final_result = []
    while True:
        
        query = input()
        # print(query)
        start = time.time()
        result = []
        documents = dict()
        query_words = list()

        # query = query.lower().strip()
#         start = time.time()
#         if (query == "exit") :
#             break

        if ":" in query :
            query_bag = query.split(" ")
            t_result=list()
            flag2=0
            for q in query_bag :
                field_query = q.split(":")
                field = field_query[0]
                query = field_query[1]
                field = mapping_shortform(field)
                query_words = query.split()
                for word in query_words :
                    word = stemmer.stem(word)
                    if word in word_position and field in word_position[word] :
                        position = word_position[word][field]
                        files[field_map[field]].seek(position)
                        s = files[field_map[field]].readline()[:-1]  # drop the trailing "\n"
                        if "," in s:
                            for item in s.split(","):
                                doc_id, score = item.split(":")
                                documents[doc_id] = documents.get(doc_id, 0.0) + float(score)
                        else:
                            # single posting: parse the line itself
                            # (the original referenced an undefined `item` here)
                            doc_id, score = s.split(":")
                            documents[doc_id] = documents.get(doc_id, 0.0) + float(score)
                        

        else:
            query_bag = [stemmer.stem(w) for w in query.split()]

            for word in query_bag:
                if word not in stop_words and word in word_position:
                    query_words.append(word)

            for word in query_words:
                positions = word_position[word]
                for field in positions.keys():
                    position = positions[field]
                    files[field_map[field]].seek(position)
                    s = files[field_map[field]].readline()[:-1]  # drop the trailing "\n"
                    if "," in s:
                        for item in s.split(","):
                            doc_id, score = item.split(":")
                            documents[doc_id] = documents.get(doc_id, 0.0) + float(score)
                    else:
                        # single posting: parse the line itself
                        # (the original referenced an undefined `item` here)
                        doc_id, score = s.split(":")
                        documents[doc_id] = documents.get(doc_id, 0.0) + float(score)
        
        documents = sorted(documents.items(), key = operator.itemgetter(1), reverse = True)
        count = 1
        end = time.time()
        print("Response Time :  " + str(end - start) + " s\n")
        for document in documents:
            position = title_position[int(document[0]) - 1]
            title_tags.seek(position)
            title = title_tags.readline()[:-1]
            result.append(title)
            print(title)
            count += 1
            if count > 10:
                break
        
        print("\n")
Example #60
0
def natural_sort(l):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split(r'([0-9]+)', key)]
    return sorted(l, key=alphanum_key)
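# Quick hedged usage check (requires import re):
print(natural_sort(['img12.png', 'img2.png', 'IMG1.png']))
# ['IMG1.png', 'img2.png', 'img12.png']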