コード例 #1
0
ファイル: snl.py プロジェクト: buriedwood/00_workSpace
def is_valid_bibtex(reference):
    """
    Check with pybtex whether a reference string parses as BibTeX.

    Args:
        reference: A String reference in BibTeX format.

    Returns:
        True when the parser finds at least one entry in the reference,
        False when it finds none.
    """
    # Non-ASCII characters are stripped before parsing; the original
    # author noted that pybtex seemed to have trouble with unicode input.
    ascii_reference = remove_non_ascii(reference)
    stream = io.StringIO(ascii_reference)
    parsed = bibtex.Parser().parse_stream(stream)
    entry_count = len(parsed.entries)
    return entry_count > 0
コード例 #2
0
def is_valid_bibtex(reference):
    """
    Check with pybtex whether a reference string parses as BibTeX.

    Args:
        reference: A String reference in BibTeX format.

    Returns:
        True when the parser finds at least one entry in the reference,
        False when it finds none.
    """
    # Non-ASCII characters are stripped before parsing; the original
    # author noted that pybtex seemed to have trouble with unicode input.
    ascii_reference = remove_non_ascii(reference)
    stream = cStringIO.StringIO(ascii_reference)
    parsed = bibtex.Parser().parse_stream(stream)
    entry_count = len(parsed.entries)
    return entry_count > 0
コード例 #3
0
ファイル: cifio.py プロジェクト: zacharygibbs/pymatgen
def _clean_cif(s):
    """
    Strip non-ASCII characters and unsupported _cgraph fields from a cif
    string.

    A line starting with "_cgraph" is dropped together with every line
    after it, up to (not including) the next line that starts with "_" or
    is exactly "loop_". Lines starting with "_eof" are dropped as well.
    Every remaining line is passed through remove_non_ascii.

    Args:
        s: The cif text, with lines separated by "\\n".

    Returns:
        The cleaned cif text, with the kept lines rejoined by "\\n".
    """
    clean = []
    skip = False
    # Iterate directly instead of repeatedly calling list.pop(0), which
    # shifts the whole list on every call and makes the loop quadratic.
    for line in s.split("\n"):
        stripped = line.strip()
        if skip:
            if stripped.startswith("_") or stripped == "loop_":
                # A new tag or loop ends the _cgraph block; this line is
                # then examined by the checks below like any other.
                skip = False
            else:
                continue

        if stripped.startswith("_cgraph"):
            skip = True
        elif not stripped.startswith("_eof"):
            clean.append(remove_non_ascii(line))

    return "\n".join(clean)
コード例 #4
0
def _clean_cif(s):
    """
    Strip non-ASCII characters and unsupported _cgraph fields from a cif
    string.

    A line starting with "_cgraph" is dropped together with every line
    after it, up to (not including) the next line that starts with "_" or
    is exactly "loop_". Lines starting with "_eof" are dropped as well.
    Every remaining line is passed through remove_non_ascii.

    Args:
        s: The cif text, with lines separated by "\\n".

    Returns:
        The cleaned cif text, with the kept lines rejoined by "\\n".
    """
    clean = []
    skip = False
    # Iterate directly instead of repeatedly calling list.pop(0), which
    # shifts the whole list on every call and makes the loop quadratic.
    for line in s.split("\n"):
        stripped = line.strip()
        if skip:
            if stripped.startswith("_") or stripped == "loop_":
                # A new tag or loop ends the _cgraph block; this line is
                # then examined by the checks below like any other.
                skip = False
            else:
                continue

        if stripped.startswith("_cgraph"):
            skip = True
        elif not stripped.startswith("_eof"):
            clean.append(remove_non_ascii(line))

    return "\n".join(clean)
コード例 #5
0
ファイル: cif.py プロジェクト: shyamd/pymatgen
    def _process_string(cls, string):
        """
        Tokenize the text of a .cif file into a queue of token tuples.

        Comments, empty lines and non-ASCII characters are removed first.
        Each remaining line is split into tokens, and text fields delimited
        by lines starting with a semicolon are rejoined into a single token.

        Args:
            string: The .cif text to tokenize.

        Returns:
            A deque of tuples. Tokens found by the regex are 3-tuples
            (unquoted, single-quoted, double-quoted) in which only the
            group that matched is non-empty. Multiline text fields are
            4-tuples ('', '', '', text).
        """
        # remove comments (raw strings keep "\s" from being treated as an
        # invalid string escape; the pattern itself is unchanged)
        string = re.sub(r"(\s|^)#.*$", "", string, flags=re.MULTILINE)
        # remove empty lines
        string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
        # remove non_ascii
        string = remove_non_ascii(string)

        # since line breaks in .cif files are mostly meaningless,
        # break up into a stream of tokens to parse, rejoining multiline
        # strings (between semicolons)
        q = deque()
        multiline = False
        ml = []
        # this regex splits on spaces, except when in quotes.
        # starting quotes must not be preceded by non-whitespace
        # (these get eaten by the first expression)
        # ending quotes must not be followed by non-whitespace
        p = re.compile(r'''([^'"\s][\S]*)|'(.*?)'(?!\S)|"(.*?)"(?!\S)''')
        for line in string.splitlines():
            if multiline:
                if line.startswith(";"):
                    # closing semicolon: emit the collected text as one
                    # token, then process whatever follows on this line
                    multiline = False
                    q.append(('', '', '', ' '.join(ml)))
                    ml = []
                    line = line[1:].strip()
                else:
                    ml.append(line)
                    continue
            if line.startswith(";"):
                # opening semicolon: start collecting a multiline field
                multiline = True
                ml.append(line[1:].strip())
            else:
                for s in p.findall(line):
                    # s is tuple. location of the data in the tuple
                    # depends on whether it was quoted in the input
                    q.append(s)
        # NOTE: a multiline field that is never closed by a ";" line is
        # not added to the queue.
        return q
コード例 #6
0
    def _process_string(cls, string):
        """
        Tokenize the text of a .cif file into a queue of token tuples.

        Comments, empty lines and non-ASCII characters are removed first.
        Each remaining line is split into tokens, and text fields delimited
        by lines starting with a semicolon are rejoined into a single token.

        Args:
            string: The .cif text to tokenize.

        Returns:
            A deque of tuples. Tokens found by the regex are 3-tuples
            (unquoted, single-quoted, double-quoted) in which only the
            group that matched is non-empty. Multiline text fields are
            4-tuples ('', '', '', text).
        """
        # remove comments (raw strings keep "\s" from being treated as an
        # invalid string escape; the pattern itself is unchanged)
        string = re.sub(r"(\s|^)#.*$", "", string, flags=re.MULTILINE)
        # remove empty lines
        string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
        # remove non_ascii
        string = remove_non_ascii(string)

        # since line breaks in .cif files are mostly meaningless,
        # break up into a stream of tokens to parse, rejoining multiline
        # strings (between semicolons)
        q = deque()
        multiline = False
        ml = []
        # this regex splits on spaces, except when in quotes.
        # starting quotes must not be preceded by non-whitespace
        # (these get eaten by the first expression)
        # ending quotes must not be followed by non-whitespace
        p = re.compile(r'''([^'"\s][\S]*)|'(.*?)'(?!\S)|"(.*?)"(?!\S)''')
        for line in string.splitlines():
            if multiline:
                if line.startswith(";"):
                    # closing semicolon: emit the collected text as one
                    # token, then process whatever follows on this line
                    multiline = False
                    q.append(('', '', '', ' '.join(ml)))
                    ml = []
                    line = line[1:].strip()
                else:
                    ml.append(line)
                    continue
            if line.startswith(";"):
                # opening semicolon: start collecting a multiline field
                multiline = True
                ml.append(line[1:].strip())
            else:
                for s in p.findall(line):
                    # s is tuple. location of the data in the tuple
                    # depends on whether it was quoted in the input
                    q.append(s)
        # NOTE: a multiline field that is never closed by a ";" line is
        # not added to the queue.
        return q
コード例 #7
0
ファイル: cifio.py プロジェクト: antoinedewandre/pymatgen
 def _process_string(cls, string):
     """
     Tokenize the text of a .cif file into a queue of string tokens.

     Comments, empty lines, leading whitespace and non-ASCII characters
     are removed first. Each remaining line is split into tokens, and
     text fields delimited by lines starting with a semicolon are
     rejoined into a single token.

     Args:
         string: The .cif text to tokenize.

     Returns:
         A deque of strings, one per token, with surrounding quotes
         removed from quoted tokens.
     """
     # remove comments; everything from a "#" to the end of its line is
     # dropped, including a "#" that appears inside a quoted value
     string = re.sub(r"#.*", "", string)
     # remove empty lines (raw strings keep "\s" from being treated as an
     # invalid string escape; the patterns themselves are unchanged)
     string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
     # remove whitespaces at beginning of lines
     string = re.sub(r"^\s*", "", string, flags=re.MULTILINE)
     # remove non_ascii
     string = remove_non_ascii(string)

     # since line breaks in .cif files are mostly meaningless,
     # break up into a stream of tokens to parse, rejoining multiline
     # strings (between semicolons)
     q = deque()
     multiline = False
     ml = []
     # this regex splits on spaces, except when in quotes.
     # it also ignores single quotes when surrounded by non-whitespace
     # since they are sometimes used in author names
     p = re.compile(r'''([^'"\s]+)|'((?:\S'\S|[^'])*)'|"([^"]*)"''')
     for line in string.splitlines():
         if multiline:
             if line.startswith(";"):
                 # closing semicolon: emit the collected text as one
                 # token, then process whatever follows on this line
                 multiline = False
                 q.append(" ".join(ml))
                 ml = []
                 line = line[1:].strip()
             else:
                 ml.append(line)
                 continue
         if line.startswith(";"):
             # opening semicolon: start collecting a multiline field
             multiline = True
             ml.append(line[1:].strip())
         else:
             for s in p.findall(line):
                 # only one regex group matches per token, so joining
                 # the groups yields the token text without its quotes
                 q.append(''.join(s))
     # NOTE: a multiline field that is never closed by a ";" line is
     # not added to the queue.
     return q
コード例 #8
0
ファイル: test_string.py プロジェクト: Rusjava/1D_TBC
 def test_remove_non_ascii(self):
     """Only the ten ASCII characters should survive remove_non_ascii."""
     # Ten random code points in the ASCII range (0-127) ...
     ascii_chars = [chr(random.randint(0, 127)) for _ in range(10)]
     # ... followed by ten random code points just above it (128-150).
     other_chars = [chr(random.randint(128, 150)) for _ in range(10)]
     mixed = "".join(ascii_chars) + "".join(other_chars)
     cleaned = remove_non_ascii(mixed)
     self.assertEqual(len(cleaned), 10)
コード例 #9
0
ファイル: test_string.py プロジェクト: davidwaroquiers/monty
 def test_remove_non_ascii(self):
     """Only the ten ASCII characters should survive remove_non_ascii."""
     # Ten random code points in the ASCII range (0-127) ...
     ascii_chars = [chr(random.randint(0, 127)) for _ in range(10)]
     # ... followed by ten random code points just above it (128-150).
     other_chars = [chr(random.randint(128, 150)) for _ in range(10)]
     mixed = "".join(ascii_chars) + "".join(other_chars)
     cleaned = remove_non_ascii(mixed)
     self.assertEqual(len(cleaned), 10)