Ejemplo n.º 1
0
def danmakuParser(xml: str) -> list:
    """
    Parse xml returned by method `danmaku`.

    Each dict in the returned list contains the keys
    "msg", "time", "type", "fontsize", "color", "date",
    "timestamp", "pool", "userid" and "danmuid".

    :param xml: danmaku XML document as a str (a list of fragments is
        also accepted for backward compatibility)
    :return: list of dicts, one per <d> element
    """
    # fromstringlist on a plain str feeds it to the parser one character
    # at a time; use fromstring for the common string case.
    if isinstance(xml, str):
        tree = ET.fromstring(xml)
    else:
        tree = ET.fromstringlist(xml)
    danmakulist = []
    for tag in tree:
        if tag.tag != 'd':
            continue
        # The 'p' attribute packs the metadata as a comma-separated list.
        params = tag.get('p').split(',')
        info = {
            'msg': tag.text,
            'time': float(params[0]),
            'type': int(params[1]),
            'fontsize': int(params[2]),
            'color': hex(int(params[3])),
            # Human-readable date is rendered in the local timezone.
            'date': Time.strftime('%Y-%m-%d %H:%M:%S',
                                  Time.localtime(int(params[4]))),
            'timestamp': int(params[4]),
            'pool': params[5],
            'userid': params[6],
            'danmuid': params[7],
        }
        danmakulist.append(info)
    return danmakulist
Ejemplo n.º 2
0
def parse_summary(folder_path, num_files, allowed_keys):
    """Collect SUM texts keyed by DOCREF from 'perdocs' files under folder_path.

    :param folder_path: directory tree to walk looking for 'perdocs' files
    :param num_files: unused; kept for interface compatibility
    :param allowed_keys: collection of DOCREF values to keep; early-exit
        comparison below is only meaningful when this is a set
    :return: dict mapping DOCREF -> summary text
    """
    raw_texts = {}
    summary_file = "perdocs"

    # Previously the parsed XML root shadowed os.walk's `root` variable;
    # use a distinct name for the walked directory.
    for dirpath, _dirs, files in os.walk(folder_path):
        if summary_file not in files:
            continue

        with open(os.path.join(dirpath, summary_file)) as f:
            # The perdocs file has no single root element, so wrap it.
            fragments = itertools.chain('<root>', f, '</root>')
            parser = ET.XMLParser(encoding="us-ascii")
            doc_root = ET.fromstringlist(fragments, parser=parser)

            for text_tag in doc_root.findall("SUM"):
                docref = text_tag.get("DOCREF")
                if docref in allowed_keys:
                    raw_texts[docref] = text_tag.text
                # dict keys-view == set holds once every allowed key is seen.
                if raw_texts.keys() == allowed_keys:
                    break

    return raw_texts
Ejemplo n.º 3
0
    def origin_metadata_get(self, project, package):
        """Return the (project, name) attributes from the package meta.

        Falls back to (None, None) when no meta element is available.
        """
        meta = ET.fromstringlist(
            osc.core.show_package_meta(self.apiurl, project, package))
        if meta is None:
            return None, None
        return meta.get('project'), meta.get('name')
Ejemplo n.º 4
0
def analysis2list(xml_txt):
    """Parse issue XML text into a list of Issue objects.

    :param xml_txt: XML document as a string (or a list of fragments)
    :return: list of Issue built from each <item>'s fields named in the
        module-level ``key_dic``, or None when parsing fails
    """
    try:
        # cElementTree was removed in Python 3.9; fall back gracefully.
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET
    try:
        if isinstance(xml_txt, str):
            root = ET.fromstring(xml_txt)
        else:
            root = ET.fromstringlist(xml_txt)
        issues = []
        for item in root.iter('item'):
            # `fields` replaces a local that shadowed the builtin `dict`.
            fields = {}
            for key in key_dic:
                node = item.find(key)
                if node is not None:
                    fields[key] = node.text
            issues.append(Issue(fields))
        return issues
    except Exception:
        # Preserve the original best-effort contract: report and return None
        # (but no longer swallow KeyboardInterrupt/SystemExit via bare except).
        print('analysis xml failure.')
        return None
Ejemplo n.º 5
0
def xml_to_dict(str_xml):
    """Convert a single-level XML document to a dict.

    Only flat documents are supported, e.g.
    <xml><name>wukai</name><age>29</age></xml> -> {'name': 'wukai', 'age': '29'}.

    :param str_xml: XML document as a string (or list of fragments)
    :return: dict mapping child tag -> child text
    """
    # `result` replaces a local that shadowed the builtin `dict`, and
    # iterating the element directly replaces getchildren(), which was
    # removed in Python 3.9.
    result = {}
    if isinstance(str_xml, str):
        tree = ET.fromstring(str_xml)
    else:
        tree = ET.fromstringlist(str_xml)
    for ele in tree:
        result[ele.tag] = ele.text
    return result
Ejemplo n.º 6
0
def parse_summary(folder_path, num_files, allowed_keys):
    """Walk *folder_path* and gather SUM texts (keyed by DOCREF) from 'perdocs' files.

    Only DOCREF values present in *allowed_keys* are kept; collection of a
    file stops early once the gathered keys equal *allowed_keys* (a set).
    """
    num_files_read = 0
    collected = {}

    for current_dir, _subdirs, filenames in os.walk(folder_path):
        if "perdocs" not in filenames:
            continue

        with open(os.path.join(current_dir, "perdocs")) as handle:
            # perdocs has no single root element, so wrap the stream.
            wrapped = itertools.chain('<root>', handle, '</root>')
            xml_root = ET.fromstringlist(
                wrapped, parser=ET.XMLParser(encoding="us-ascii"))

            for sum_tag in xml_root.findall("SUM"):
                ref = sum_tag.get("DOCREF")
                if ref in allowed_keys:
                    collected[ref] = sum_tag.text
                if collected.keys() == allowed_keys:
                    break

    return collected
Ejemplo n.º 7
0
def list_of_file(filename):
    """Parse a gzipped XML corpus and return (key, paragraphs) pairs.

    :param filename: path to a gzip-compressed file of XML document
        elements (no single root element; one is added before parsing)
    :return: list of (headline, texts) tuples; the document id is used as
        the key when no HEADLINE text was found
    """
    with gzip.open(filename, "r") as f:
        parser = custom_xml_parser(encoding='utf-8')
        l = f.readlines()

        # Wrap the decoded lines so the stream forms one well-formed document.
        it = itertools.chain('<root>', [i.decode('utf-8') for i in l],
                             '</root>')
        root = ET.fromstringlist(it, parser=parser)

    lists = []
    doc_id = ""
    for element in root:
        headline = ""
        list1 = []
        doc_id = element.attrib["id"]
        # print(element.attrib["id"])
        for e in element:
            if (e.tag == 'HEADLINE'):
                headline = e.text.strip()

            # Cleaned paragraph texts; non-empty only for TEXT elements.
            list2 = [clean_text(i.text) for i in e if e.tag == 'TEXT']
            if len(list2) > 0:
                # lists.append(doc_id, list2)
                # Prefer the headline as the key, fall back to the doc id.
                lists.append(
                    (headline, list2)) if len(headline) > 0 else lists.append(
                        (doc_id, list2))

        # if len(list1) > 0:
        #     lists.append((headline, list1)) if len(headline) > 0 else lists.append((doc_id, list1))
    return lists
    def get_build_succeeded_packages(self, project):
        """Get the build succeeded packages from `from_prj` project.

        Only the standard/x86_64 repository results are inspected.  For
        multibuild flavors (package names containing ':'), the main package
        counts as succeeded only while none of its flavors has failed.
        """

        f = osc.core.show_prj_results_meta(self.apiurl, project)
        root = ET.fromstringlist(f)
        #print ET.dump(root)

        failed_multibuild_pacs = []
        pacs = []
        for node in root.findall('result'):
            if node.get('repository') == 'standard' and node.get('arch') == 'x86_64':
                for pacnode in node.findall('status'):
                    if ':' in pacnode.get('package'):
                        # Multibuild flavor: failures are tracked against the
                        # main package name (the part before ':').
                        mainpac = pacnode.get('package').split(':')[0]
                        if pacnode.get('code') not in ['succeeded', 'excluded']:
                            failed_multibuild_pacs.append(pacnode.get('package'))
                            if mainpac not in failed_multibuild_pacs:
                                failed_multibuild_pacs.append(mainpac)
                            if mainpac in pacs:
                                pacs.remove(mainpac)
                        else:
                            # NOTE(review): appending a *succeeded* flavor to the
                            # failed list when its main package already failed
                            # looks intentional (it keeps the flavor excluded),
                            # but confirm against the caller's expectations.
                            if mainpac in failed_multibuild_pacs:
                                failed_multibuild_pacs.append(pacnode.get('package'))
                            elif mainpac not in pacs:
                                pacs.append(mainpac)
                        continue
                    if pacnode.get('code') == 'succeeded':
                        pacs.append(pacnode.get('package'))
            else:
                logging.error("Can not find standard/x86_64 results")

        return pacs
Ejemplo n.º 9
0
    def sle_workarounds_unneeded_check(self, package):
        """Request deletion of a SLE-workarounds package that is no longer sourced from it."""
        # If SLE-workarounds project and package was not sourced from
        # SLE-workarounds, but it does exist in SLE-workarounds.
        if (self.sle_workarounds and not self.sle_workarounds_sourced
                and package in self.packages[self.sle_workarounds]):
            # Determine how recently the package was updated.
            root = ET.fromstringlist(
                get_commitlog(self.apiurl,
                              self.sle_workarounds,
                              package,
                              None,
                              format='xml'))
            updated_last = date_parse(root.find('logentry/date').text)
            age = datetime.now() - updated_last
            # Leave freshly-updated packages alone for 24 hours.
            if age.total_seconds() < 3600 * 24:
                logger.debug(
                    'skip removal of {}/{} since updated within 24 hours'.
                    format(self.sle_workarounds, package))
                return

            # An open submit request means the package is still in motion;
            # do not race it with a delete request.
            requests = get_request_list(self.apiurl,
                                        self.sle_workarounds,
                                        package,
                                        req_type='submit')
            if len(requests):
                logger.debug('existing submit request involving {}/{}'.format(
                    self.sle_workarounds, package))
                return

            # Safe to request deletion of the now-unneeded workaround package.
            self.delete_request(
                self.sle_workarounds, package,
                'sourced from {}'.format(self.lookup.get(package)))
Ejemplo n.º 10
0
def project_meta_revision(apiurl, project):
    """Return the latest commit revision (as int) of the project's _project meta."""
    log = get_commitlog(apiurl, project, '_project', None,
                        format='xml', meta=True)
    root = ET.fromstringlist(log)
    return int(root.find('logentry').get('revision'))
Ejemplo n.º 11
0
    def _parsexml(cls, response):

        # parse XML into ElementTree element
        try:
            return ElementTree.fromstringlist(response)
        except ParseError as e:
            raise LoonError(
                "Unable to parse response: {0}".format(e)
            )
Ejemplo n.º 12
0
def devel_project_get(apiurl, target_project, target_package):
    """Return (devel project, devel package) from the package meta.

    A 404 from the API is treated as "no devel project"; any other HTTP
    error propagates. Returns (None, None) when nothing is found.
    """
    try:
        meta = ET.fromstringlist(show_package_meta(apiurl, target_project, target_package))
    except HTTPError as e:
        if e.code != 404:
            raise e
    else:
        devel = meta.find('devel')
        if devel is not None:
            return devel.get('project'), devel.get('package')

    return None, None
Ejemplo n.º 13
0
def devel_project_get(apiurl, target_project, target_package):
    """Return (devel project, devel package) from the package meta.

    :param apiurl: OBS API URL
    :param target_project: project containing the package
    :param target_package: package whose <devel> element is inspected
    :return: (project, package) tuple, or (None, None) when the package
        has no devel element or does not exist (HTTP 404)
    """
    try:
        meta = ET.fromstringlist(show_package_meta(apiurl, target_project, target_package))
        node = meta.find('devel')
        if node is not None:
            return node.get('project'), node.get('package')
    except HTTPError as e:
        # 404 simply means no such package; anything else is unexpected.
        if e.code != 404:
            raise e

    return None, None
Ejemplo n.º 14
0
def repository_path_expand(apiurl, project, repo):
    """Recursively list underlying [project, repository] pairs."""
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    paths = meta.findall('.//repository[@name="{}"]/path'.format(repo))

    result = [[project, repo]]
    # All but the last listed path are taken as-is...
    for entry in paths[:-1]:
        result.append([entry.get('project', project), entry.get('repository')])

    # ...while the last one is expanded recursively.
    if paths:
        tail = paths[-1]
        result += repository_path_expand(
            apiurl, tail.get('project', project), tail.get('repository'))
    return result
Ejemplo n.º 15
0
def _repository_path_expand(apiurl, project, repo):
    """Recursively list underlying projects.

    :return: OrderedDict mapping project -> repository; insertion order
        reflects the expansion order, and re-inserted projects are moved
        to the end (only the last repo for a project is kept).
    """

    repos = OrderedDict()

    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    for path in meta.findall('.//repository[@name="{}"]/path'.format(repo)):
        # NOTE(review): this calls the non-underscore repository_path_expand();
        # verify that is intentional and not a typo for _repository_path_expand.
        rp = repository_path_expand(apiurl, path.get('project', project), path.get('repository'))
        # NOTE(review): the loop below rebinds `project`, so later iterations
        # of the outer loop use the rebound value as the path.get() default —
        # confirm this is intended.
        for project, repo in rp:
            # only the last repo for a project is remembered by OBS
            if project in repos:
                del repos[project]
            repos[project] = repo

    return repos
Ejemplo n.º 16
0
    def get_tree(self, path):
        """Return the parsed HTML tree for *path*, caching the result.

        Raises FailedCheck when the resolved path is not an existing file,
        and RuntimeError when the file cannot be parsed as HTML.
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = os.path.join(self.root, path)
        if not os.path.exists(abspath) or not os.path.isfile(abspath):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
        self.trees[path] = tree
        return tree
Ejemplo n.º 17
0
    def get_tree(self, path):
        """Return the parsed HTML tree for *path*, caching parsed results.

        :raises FailedCheck: when the resolved path is not an existing file
        :raises RuntimeError: when the file cannot be parsed as HTML
        """
        path = self.resolve_path(path)
        if path in self.trees:
            return self.trees[path]

        abspath = os.path.join(self.root, path)
        if not(os.path.exists(abspath) and os.path.isfile(abspath)):
            raise FailedCheck('File does not exist {!r}'.format(path))

        with io.open(abspath, encoding='utf-8') as f:
            try:
                tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
            except Exception as e:
                raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
            # Cache before returning so subsequent calls hit the fast path.
            self.trees[path] = tree
            return self.trees[path]
Ejemplo n.º 18
0
def readdata(path):
    """Read all documents under *path* into (DOCNO, TEXT-bytes) pairs.

    Each file is wrapped in a <root> element, stripped of backticks,
    ampersands and quotes, then parsed; DOC entries whose TEXT has two or
    fewer whitespace-separated tokens are skipped.

    :param path: directory containing the corpus files
    :return: (Alldata, N) where Alldata is a list of (id, serialized TEXT
        element as bytes) tuples and N is its length
    """
    Alldata = []
    N = 0
    for file in files_in_dir(path) if False else listdir(path):
        with open(path + '/' + file) as inputfile:
            content = '<root>' + inputfile.read() + '</root>'
            # Strip characters that commonly break the XML parser.
            content = re.sub("[`&\"\']+", '', content)
            root = ET.fromstring(content)
            # Iterate the root directly: getchildren() was removed in
            # Python 3.9.
            for Doc in root:
                ID = Doc.find('DOCNO').text
                data = ET.tostring(Doc.find('TEXT'))
                # Skip near-empty documents (<= 2 tokens).
                if len(data.split()) <= 2:
                    continue
                Alldata.append((ID, data))
                N = N + 1
    return Alldata, N
Ejemplo n.º 19
0
 def __init__(self, xml_file=None, xml_str=None):
     """
     Initialize from an XML string (preferred) or an XML file path.

     :param xml_file: path to an XML file (used only when xml_str is None)
     :param xml_str: XML content; a single string, or a list of string
         fragments concatenated by fromstringlist
     """
     if xml_str is not None:
         if type(xml_str) is list:
             # Bug fix: the list case must be handled before the regex /
             # string-join below, which only work on str input and
             # previously raised TypeError for lists.
             super(XMLEtAnalyzer,
                   self).__init__(element=fromstringlist(xml_str))
         else:
             # Prepend an XML declaration when the string lacks one.
             if re.match('^<\?xml', xml_str) is None:
                 xml_str = ''.join(
                     ('<?xml version="1.0" encoding="utf-8" ?>', xml_str))
             super(XMLEtAnalyzer,
                   self).__init__(element=fromstring(xml_str))
     else:
         super(XMLEtAnalyzer, self).__init__(file=xml_file)
Ejemplo n.º 20
0
def repository_path_expand(apiurl, project, repo, repos=None):
    """Recursively list underlying [project, repository] pairs, depth-first.

    ``repos`` is the accumulator used by the recursive calls; leave it as
    None at the top level.
    """
    if repos is None:
        # Fresh accumulator per top-level call; a mutable default would
        # leak state between calls.
        repos = []

    pair = [project, repo]
    if pair in repos:
        # Some devel projects (e.g. graphics) list the same path twice for
        # openSUSE:Factory/snapshot; keep the result free of duplicates.
        return repos
    repos.append(pair)

    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    query = './/repository[@name="{}"]/path'.format(repo)
    for path in meta.findall(query):
        repository_path_expand(apiurl, path.get('project', project),
                               path.get('repository'), repos)

    return repos
Ejemplo n.º 21
0
def load_data(input_file):
    """Load spelling-correction records from a SIGHAN-style XML-ish file.

    The file interleaves <SENTENCE> blocks with single-line
    <TEXT>...</TEXT> entries; each block's MISTAKE elements are collected.

    Bug fix: the original `while` loop hit `continue` on empty text
    WITHOUT advancing to the next line, looping forever; iterating the
    file with `for` makes that impossible. The file handle is now also
    closed via `with`.

    :param input_file: path of the file to parse
    :return: list of {"text": ..., "mistakes": [{"wrong", "correct",
        "loc"}, ...]} dicts; blocks with no real mistakes are dropped
    """
    contents = []
    passages = []
    text = ''
    with open(input_file) as reader:
        for line in reader:
            line = line.strip()
            if line.startswith("</SENTENCE>"):
                passages.append(line)
                sentence = ET.fromstringlist(passages)
                passages = []
                if not text:
                    continue
                content = {"text": text, "mistakes": []}
                for mistake in sentence.iter("MISTAKE"):
                    wrong = mistake.findtext("WRONG")
                    correct = mistake.findtext("CORRECTION")
                    # A "mistake" identical to its correction is noise.
                    if wrong == correct:
                        continue
                    content["mistakes"].append({
                        "wrong": wrong,
                        "correct": correct,
                        "loc": mistake.findtext("LOCATION"),
                    })
                if len(content["mistakes"]) > 0:
                    contents.append(content)
                text = ''
            elif line.startswith("<TEXT>"):
                text = line[len('<TEXT>'):-len('</TEXT>')]
            elif line:
                passages.append(line)
    num = len(contents)
    print(f'{input_file} has loaded, total {num} records')
    return contents
Ejemplo n.º 22
0
def repository_path_expand(apiurl, project, repo, repos=None):
    """Recursively list underlying projects.

    :param repos: accumulator of [project, repo] pairs used by the
        recursive calls; leave as None at the top level
    :return: list of [project, repository] pairs, depth-first
    """

    if repos is None:
        # Avoids screwy behavior where list as default shares reference for all
        # calls which effectively means the list grows even when new project.
        repos = []

    if [project, repo] in repos:
        # For some reason devel projects such as graphics include the same path
        # twice for openSUSE:Factory/snapshot. Does not hurt anything, but
        # cleaner not to include it twice.
        return repos

    repos.append([project, repo])

    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    for path in meta.findall('.//repository[@name="{}"]/path'.format(repo)):
        repository_path_expand(apiurl, path.get('project', project),
                               path.get('repository'), repos)

    return repos
Ejemplo n.º 23
0
    def sle_workarounds_unneeded_check(self, package):
        """Request deletion of a SLE-workarounds package no longer sourced from it."""
        # If SLE-workarounds project and package was not sourced from
        # SLE-workarounds, but it does exist in SLE-workarounds.
        if (self.sle_workarounds and not self.sle_workarounds_sourced and
            package in self.packages[self.sle_workarounds]):
            # Determine how recently the package was updated.
            root = ET.fromstringlist(
                get_commitlog(self.apiurl, self.sle_workarounds, package, None, format='xml'))
            updated_last = date_parse(root.find('logentry/date').text)
            age = datetime.now() - updated_last
            # Recently-touched packages are left alone for 24 hours.
            if age.total_seconds() < 3600 * 24:
                logger.debug('skip removal of {}/{} since updated within 24 hours'.format(
                    self.sle_workarounds, package))
                return

            # An open submit request means the package is still in motion.
            requests = get_request_list(self.apiurl, self.sle_workarounds, package, req_type='submit')
            if len(requests):
                logger.debug('existing submit request involving {}/{}'.format(self.sle_workarounds, package))
                return

            self.delete_request(self.sle_workarounds, package,
                                'sourced from {}'.format(self.lookup.get(package)))
Ejemplo n.º 24
0
    def get_build_succeeded_packages(self, project):
        """Get the build succeeded packages from `from_prj` project.

        Only the standard/x86_64 repository results are inspected.  For
        multibuild flavors (package names containing ':'), the main
        package counts as succeeded only while none of its flavors failed.
        """

        f = osc.core.show_prj_results_meta(self.apiurl, project)
        root = ET.fromstringlist(f)
        # print ET.dump(root)

        failed_multibuild_pacs = []
        pacs = []
        for node in root.findall('result'):
            if node.get('repository') == 'standard' and node.get(
                    'arch') == 'x86_64':
                for pacnode in node.findall('status'):
                    if ':' in pacnode.get('package'):
                        # Multibuild flavor: failures are tracked against
                        # the main package name (the part before ':').
                        mainpac = pacnode.get('package').split(':')[0]
                        if pacnode.get('code') not in [
                                'succeeded', 'excluded'
                        ]:
                            failed_multibuild_pacs.append(
                                pacnode.get('package'))
                            if mainpac not in failed_multibuild_pacs:
                                failed_multibuild_pacs.append(mainpac)
                            if mainpac in pacs:
                                pacs.remove(mainpac)
                        else:
                            # NOTE(review): a succeeded flavor is appended to
                            # the failed list when its main package already
                            # failed — confirm this bookkeeping is intended.
                            if mainpac in failed_multibuild_pacs:
                                failed_multibuild_pacs.append(
                                    pacnode.get('package'))
                            elif mainpac not in pacs:
                                pacs.append(mainpac)
                        continue
                    if pacnode.get('code') == 'succeeded':
                        pacs.append(pacnode.get('package'))
            else:
                logging.error("Can not find standard/x86_64 results")

        return pacs
Ejemplo n.º 25
0
    'no-cache',
    'cookie':
    cookie,
}

# Fetch the issue feed and parse each <item> into a dict of the fields
# named in the module-level key_dic.
r = requests.get(xml_url, headers=headers, params=params)

xml_txt = r.text
# print(xml_txt)

try:
    # cElementTree was removed in Python 3.9; fall back to ElementTree.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# NOTE(review): fromstringlist on a str feeds the parser one character at
# a time; ET.fromstring(xml_txt) would be the direct call here.
root = ET.fromstringlist(xml_txt)

res = []

# Catch all issue about me
for item in root.iter('item'):
    # NOTE(review): `dict` shadows the builtin for the rest of the script.
    dict = {}
    for key in key_dic:
        iss_item = item.find(key)
        if iss_item is None:
            continue
        else:
            dict[key] = iss_item.text
    res.append(dict)

# print(res)
Ejemplo n.º 26
0
def read_file(path,
              parse_headline=True, parse_dateline=True,
              parse_coreferences=True, parse_sentences=True,
              parse_text=True):
    """Yield Document objects parsed from a gzipped corpus of <DOC> blocks.

    NOTE(review): gzip.open() without a text mode yields bytes on
    Python 3, so `line.strip() == '</DOC>'` would never match there —
    this looks written for Python 2; confirm the target runtime.

    NOTE(review): the `parse_text` parameter shadows what appears to be a
    module-level parse_text() helper; `text = parse_text(text)` below
    would call the boolean flag and raise TypeError.  A sibling variant
    uses an underscore-prefixed _parse_text() — verify the intended name.
    """
    with gzip.open(path) as source:
        source.readline()
        # file_line = source.readline() + "</FILE>"
        # file_tag = etree.fromstring(file_line)
        # file_id = file_tag.attrib['id']

        lines = []
        for line in source:
            lines.append(line)

            if line.strip() == '</DOC>':
                # Wrap the accumulated DOC block so it parses as one document.
                lines = ['<xml>'] + lines
                lines.append('</xml>')
                xml = etree.fromstringlist(lines).find('DOC')

                doc_id = xml.attrib['id']
                # The doc id embeds the date, e.g. XXX_19950101.0001.
                date_str = doc_id.split('_')[-1].split('.')[0]
                date = parse_ymd(date_str)

                headline_xml = xml.find('HEADLINE')
                if headline_xml is not None and parse_headline:
                    headline = parse_lisp(headline_xml.text.strip())
                else:
                    headline = None

                dateline_xml = xml.find('DATELINE')
                if dateline_xml is not None and parse_dateline:
                    dateline = parse_lisp(dateline_xml.text.strip())
                else:
                    dateline = None

                coreferences = xml.find('coreferences')
                if coreferences is not None and parse_coreferences:
                    coreferences = [[parse_mention(m) for m in x]
                                    for x in coreferences]
                else:
                    coreferences = []

                sentences = xml.find('sentences')
                if sentences is not None and parse_sentences:
                    sentences = [parse_sentence(x)
                                 for x in xml.find('sentences')]
                else:
                    sentences = []

                text = xml.find('TEXT')
                if text is not None and parse_text:
                    text = parse_text(text)
                else:
                    text = None

                yield Document(
                    id=xml.attrib['id'],
                    date=date,
                    type=xml.attrib['type'],
                    headline=headline,
                    dateline=dateline,
                    text=text,
                    sentences=sentences,
                    coreferences=coreferences)
                lines = []
Ejemplo n.º 27
0
def read_file(path,
              parse_headline=True, parse_dateline=True,
              parse_coreferences=True, parse_sentences=True,
              parse_text=True, simple_token=True):
    """Yield Document objects parsed from a gzip-compressed corpus file.

    The file holds a header line followed by a stream of <DOC ...>...</DOC>
    blocks (Gigaword-style).  Each flag parameter enables parsing of the
    corresponding section; *simple_token* is forwarded to _parse_sentence.

    :param path: path to the gzipped source file
    """
    with gzip.open(path, 'rt') as source:
        source.readline()
        # file_line = source.readline() + "</FILE>"
        # file_tag = etree.fromstring(file_line)
        # file_id = file_tag.attrib['id']

        lines = []
        for line in source:
            lines.append(line)

            if line.strip() == '</DOC>':
                # Wrap the accumulated DOC block so it forms a single
                # well-formed XML document.
                lines = ['<xml>'] + lines
                lines.append('</xml>')
                xml = etree.fromstringlist(lines).find('DOC')

                doc_id = xml.attrib['id']
                # The doc id embeds the date, e.g. XXX_19950101.0001.
                date_str = doc_id.split('_')[-1].split('.')[0]
                date = _parse_ymd(date_str)

                # Bug fix: an Element with no children is falsy, so the
                # previous `if element and flag` tests silently skipped
                # present-but-childless sections (HEADLINE/DATELINE carry
                # only text and were therefore never parsed); compare
                # against None explicitly.
                headline_xml = xml.find('HEADLINE')
                if headline_xml is not None and parse_headline:
                    headline = _parse_lisp(headline_xml.text.strip())
                else:
                    headline = None

                dateline_xml = xml.find('DATELINE')
                if dateline_xml is not None and parse_dateline:
                    dateline = _parse_lisp(dateline_xml.text.strip())
                else:
                    dateline = None

                coreferences = xml.find('coreferences')
                if coreferences is not None and parse_coreferences:
                    coreferences = [[_parse_mention(m) for m in x]
                                    for x in coreferences]
                else:
                    coreferences = []

                sentences = xml.find('sentences')
                if sentences is not None and parse_sentences:
                    sentences = [_parse_sentence(x, simple_token)
                                 for x in xml.find('sentences')]
                else:
                    sentences = []

                text = xml.find('TEXT')
                if text is not None and parse_text:
                    text = _parse_text(text)
                else:
                    text = None

                yield Document(
                    id=xml.attrib['id'],
                    date=date,
                    type=xml.attrib['type'],
                    headline=headline,
                    dateline=dateline,
                    text=text,
                    sentences=sentences,
                    coreferences=coreferences)
                lines = []
Ejemplo n.º 28
0
def project_locked(apiurl, project):
    """Return True when the project's meta contains a lock/enable element."""
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    lock = meta.find('lock/enable')
    return lock is not None
Ejemplo n.º 29
0
def project_locked(apiurl, project):
    """Return True when the project meta contains a lock/enable element."""
    meta = ET.fromstringlist(show_project_meta(apiurl, project))
    return meta.find('lock/enable') is not None
Ejemplo n.º 30
0
def project_meta_revision(apiurl, project):
    """Return the latest commit revision (int) of the project's _project meta."""
    root = ET.fromstringlist(get_commitlog(
        apiurl, project, '_project', None, format='xml', meta=True))
    return int(root.find('logentry').get('revision'))
Ejemplo n.º 31
0
    def fetch(self, xml=None, sequence=None, **kwargs):
        """Get Blast record from url or file.

        :arg sequence: an object with an associated sequence string
            or a sequence string itself
        :type sequence: :class:`Atomic`, :class:`Sequence`, or str

        :arg xml: blast search results in XML format or an XML file that
            contains the results or a filename for saving the results or None
        :type xml: str

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int

        :returns: True when results were parsed (or already present),
            False when the remote search timed out
        """
        if self.isSuccess:
            LOGGER.warn(
                "The record already exists so not further search is performed")
            return True

        if sequence is None:
            sequence = self._sequence

        if xml is None:
            xml = self._xml

        # NOTE(review): xml.etree.cElementTree was removed in Python 3.9;
        # confirm the supported runtime or fall back to ElementTree.
        import xml.etree.cElementTree as ET
        have_xml = False
        filename = None
        if xml is not None:
            # Heuristic: short values are treated as filenames, longer
            # values as in-memory XML content.
            if len(xml) < 100:
                # xml likely contains a filename
                if os.path.isfile(xml):
                    # read the contents
                    try:
                        xml = ET.parse(xml)
                        root = xml.getroot()
                        have_xml = True
                    except:
                        raise ValueError('could not parse xml from xml file')
                else:
                    # xml contains a filename for writing
                    filename = xml
            else:
                try:
                    if isinstance(xml, list):
                        root = ET.fromstringlist(xml)
                    elif isinstance(xml, str):
                        root = ET.fromstring(xml)
                except:
                    raise ValueError(
                        'xml is not a filename and does not look like'
                        ' a valid XML string')
                else:
                    have_xml = True

        if have_xml is False:
            # we still need to run a blast
            headers = {'User-agent': 'ProDy'}
            query = [
                ('DATABASE', 'pdb'),
                ('ENTREZ_QUERY', '(none)'),
                ('PROGRAM', 'blastp'),
            ]

            expect = float(kwargs.pop('expect', 10e-10))
            if expect <= 0:
                raise ValueError('expect must be a positive number')
            query.append(('EXPECT', expect))
            hitlist_size = int(kwargs.pop('hitlist_size', 250))
            if hitlist_size <= 0:
                raise ValueError('expect must be a positive integer')
            query.append(('HITLIST_SIZE', hitlist_size))
            query.append(('QUERY', sequence))
            query.append(('CMD', 'Put'))

            sleep = float(kwargs.pop('sleep', 2))
            timeout = float(kwargs.pop('timeout', self._timeout))
            self._timeout = timeout

            # Python 2/3 compatible urlencode that returns bytes.
            try:
                import urllib.parse
                urlencode = lambda data: bytes(urllib.parse.urlencode(data),
                                               'utf-8')
            except ImportError:
                from urllib import urlencode

            url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

            data = urlencode(query)
            LOGGER.timeit('_prody_blast')
            LOGGER.info(
                'Blast searching NCBI PDB database for "{0}..."'.format(
                    sequence[:5]))
            handle = openURL(url, data=data, headers=headers)

            # The submission response embeds a request id (RID) that is
            # used to poll for the results below.
            html = handle.read()
            index = html.find(b'RID =')
            if index == -1:
                raise Exception('NCBI did not return expected response.')
            else:
                last = html.find(b'\n', index)
                rid = html[index + len('RID ='):last].strip()

            query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                     ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
            data = urlencode(query)

            # Poll NCBI, backing off by 1.5x each round, until the status
            # is READY or the configured timeout elapses.
            while True:
                LOGGER.sleep(int(sleep),
                             'to reconnect to NCBI for search results.')
                LOGGER.write('Connecting to NCBI for search results...')
                handle = openURL(url, data=data, headers=headers)
                results = handle.read()
                index = results.find(b'Status=')
                LOGGER.clear()
                if index < 0:
                    break
                last = results.index(b'\n', index)
                status = results[index + len('Status='):last].strip()
                if status.upper() == b'READY':
                    break
                sleep = int(sleep * 1.5)
                if LOGGER.timing('_prody_blast') > timeout:
                    LOGGER.warn('Blast search time out.')
                    return False

            LOGGER.clear()
            LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

            root = ET.XML(results)
            # Persist the raw XML when a writable filename was supplied
            # (filename stays None otherwise, hence the AttributeError guard).
            try:
                ext_xml = filename.lower().endswith('.xml')
            except AttributeError:
                pass
            else:
                if not ext_xml:
                    filename += '.xml'
                out = open(filename, 'w')
                if PY3K:
                    out.write(results.decode())
                else:
                    out.write(results)
                out.close()
                LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        # Strip the 'BlastOutput_' prefix and validate the search metadata.
        root = dictElement(root, 'BlastOutput_')

        if root['db'] != 'pdb':
            raise ValueError('blast search database in xml must be "pdb"')
        if root['program'] != 'blastp':
            raise ValueError('blast search program in xml must be "blastp"')
        self._param = dictElement(root['param'][0], 'Parameters_')

        query_len = int(root['query-len'])
        if sequence and len(sequence) != query_len:
            raise ValueError('query-len and the length of the sequence do not '
                             'match, xml data may not be for given sequence')
        hits = []
        for iteration in root['iterations']:
            for hit in dictElement(iteration, 'Iteration_')['hits']:
                hit = dictElement(hit, 'Hit_')
                data = dictElement(hit['hsps'][0], 'Hsp_')
                # Normalize numeric fields and derive identity/coverage
                # percentages for each high-scoring pair.
                for key in [
                        'align-len', 'gaps', 'hit-frame', 'hit-from', 'hit-to',
                        'identity', 'positive', 'query-frame', 'query-from',
                        'query-to'
                ]:
                    data[key] = int(data[key])
                data['query-len'] = query_len
                for key in ['evalue', 'bit-score', 'score']:
                    data[key] = float(data[key])
                p_identity = 100.0 * data['identity'] / (
                    data['query-to'] - data['query-from'] + 1)
                data['percent_identity'] = p_identity
                p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                             query_len)
                data['percent_coverage'] = p_overlap

                # One hit may describe several PDB chains; record one
                # entry per (pdb_id, chain_id) pair.
                for item in (hit['id'] + hit['def']).split('>gi'):
                    head, title = item.split(None, 1)
                    head = head.split('|')
                    pdb_id = head[-2].lower()
                    chain_id = head[-1][:1]
                    pdbch = dict(data)
                    pdbch['pdb_id'] = pdb_id
                    pdbch['chain_id'] = chain_id
                    pdbch['title'] = (head[-1][1:] + title).strip()
                    hits.append((p_identity, p_overlap, pdbch))
        # Best identity first.
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits

        return True
Ejemplo n.º 32
0
def read_file(path,
              p_headline=True, p_dateline=True,
              p_coreferences=True, p_sentences=True,
              p_text=True):
    """Stream documents out of a gzipped, Gigaword-style XML file.

    The file is read line by line; each run of lines up to a closing
    ``</DOC>`` tag is wrapped in a synthetic ``<xml>`` root, parsed, and
    yielded as a ``Document``. The ``p_*`` flags let callers skip the
    corresponding (potentially expensive) sub-parses.

    :param path: path to a ``.gz`` file containing concatenated <DOC> blocks
    :yields: one ``Document`` per <DOC> element
    """
    # Normalize ampersand escaping: first collapse any (case-insensitive)
    # '&amp;' back to '&', then re-escape every bare '&' as '&amp;'.
    escaped_amp = re.compile(r'&amp;', re.IGNORECASE)
    bare_amp = re.compile(r'&')

    with gzip.open(path) as stream:
        buffered = []
        for raw in stream:
            buffered.append(bare_amp.sub('&amp;', escaped_amp.sub('&', raw)))

            # Keep accumulating until the current document is closed.
            if raw.strip() != '</DOC>':
                continue

            # Wrap in a synthetic root so the fragment parses as one tree.
            buffered = ['<xml>'] + buffered
            buffered.append('</xml>')
            doc = etree.fromstringlist(buffered).find('DOC')

            # Dates are encoded in the id, e.g. "..._19950101.0001".
            doc_id = doc.attrib['id']
            date = parse_ymd(doc_id.split('_')[-1].split('.')[0])

            node = doc.find('HEADLINE')
            headline = (node.text.strip()
                        if node is not None and p_headline else None)

            node = doc.find('DATELINE')
            dateline = (node.text.strip()
                        if node is not None and p_dateline else None)

            node = doc.find('coreferences')
            if node is not None and p_coreferences:
                coreferences = [[parse_mention(m) for m in chain]
                                for chain in node]
            else:
                coreferences = []

            node = doc.find('sentences')
            if node is not None and p_sentences:
                sentences = [parse_sentence(s) for s in node]
            else:
                sentences = []

            node = doc.find('TEXT')
            text = parse_text(node) if node is not None and p_text else None

            yield Document(
                id=doc.attrib['id'],
                date=date,
                type=doc.attrib['type'],
                headline=headline,
                dateline=dateline,
                text=text,
                sentences=sentences,
                coreferences=coreferences)
            buffered = []
# NOTE: xml.etree.cElementTree was deprecated in 3.3 and REMOVED in
# Python 3.9 — import the plain ElementTree module instead (it is
# C-accelerated transparently).
import xml.etree.ElementTree as ET

# Sample component-layout XML used to demonstrate parsing.
data = '''
<ComponentRoot name="ComponentLayout">
	<Component name="RSBBStartEndCaseButton" style="StartEndCaseButtonStyle">
		<Size name = "Default">
			<Item name = "Group">
				<Width>85.5</Width>
				<Height>50</Height>
				<Radius>2</Radius>
			</Item>
			<Item name="ButtonText" type = "text"  translationProp = "Default">
				<width>85.5</width>
				<Height>50</Height>
				<LeftMargin>0</LeftMargin>
				<TopMargin>0</TopMargin>
				<NumberLines>1</NumberLines>
				<FontSize>10</FontSize>
				<FontType>Normal</FontType>
			</Item>
		</Size>
    </Component>
</ComponentRoot>
'''

# Use fromstring for a whole document held in one string.
# fromstringlist expects a *sequence of fragments*; passing a str makes
# it feed the parser one character at a time.
myroot = ET.fromstring(data)
print(myroot.tag)