def get_fields(strng, strict=False):
    """Return a list of (field, value) pairs parsed from *strng*.

    *strng* is the body of a BibTeX entry after the cite key: a sequence
    of ``name = value`` assignments separated by commas.  A value may be
    delimited by braces, by double quotes, or be bare (a number, or a
    string-macro expression whose pieces are joined with ``#``).

    If *strict* is True, only field names listed in
    ``helper.bibtexfields`` are kept.
    """
    ss = strng.strip()
    if not ss.endswith(','):  # make sure the last field is comma-terminated
        ss += ','
    fields = []
    while True:
        name, sep, ss = ss.partition('=')
        if sep == '':
            break  # no '=' left: we reached the end of the string
        name = name.strip().lower()
        # Error recovery: a field name should be a single word; if junk
        # from a previous value leaked in, keep only the last word.
        if len(name.split()) > 1:
            name = name.split()[-1]
        ss = ss.strip()
        if ss[0] == '{':  # value delimited by braces
            s, e = helper.match_pair(ss)
            data = ss[s + 1:e - 1].strip()
        elif ss[0] == '"':  # value delimited by double quotes
            s = ss.find('"')
            e = ss.find('"', s + 1)
            data = ss[s + 1:e].strip()
        else:  # bare value: a number or a string-macro expression
            e = ss.find(',')
            data = ss[0:e].strip()
            if not data.isdigit():  # then it involves string macros
                dd = data.split('#')  # '#' joins string macros in BibTeX
                if len(dd) > 1:
                    for n in range(len(dd)):
                        dd[n] = dd[n].strip()
                        dd[n] = dd[n].replace('{', '"').replace('}', '"')
                        if dd[n][0] != '"':
                            dd[n] = 'definitionofstring(%s) ' % (dd[n])
                    data = '#'.join(dd)
                else:
                    data = 'definitionofstring(%s) ' % (data.strip())
        # Advance past the comma that terminates this field.  The original
        # code did ``ss[e].find(',')`` -- a search inside a single
        # character -- which returned -1 whenever whitespace preceded the
        # comma and left stray text at the front of ss.
        e = ss.find(',', e)
        ss = ss[e + 1:]
        # JF: Temporario, descomentar si hay problemas
        # if name=='title':
        #     data=helper.capitalizestring(data)
        # else:
        #     data=helper.removebraces(data)
        if not strict or name in helper.bibtexfields:
            fields.append((name, data))
    return fields
def parsedata(data):
    """Parse a string holding a whole BibTeX database.

    Returns ``(strings, entries)``: *strings* collects the macro
    definitions reported by ``parseentry``, and *entries* maps each
    entry's cite key (stored under '_code' by ``parseentry``) to the
    parsed entry dict.
    """
    # An '@' followed by a word and an opening brace or parenthesis.
    pub_rex = re.compile(r'\s?@(\w*)\s*[{\(]')
    # Reformat the string: collapse every whitespace run to one space.
    ss = re.sub(r'\s+', ' ', data).strip()
    strings = {}
    entries = {}
    while True:
        m = pub_rex.search(ss)
        if m is None:
            break
        # Entries may be wrapped in parentheses instead of braces.
        if m.group(0)[-1] == '(':
            d = helper.match_pair(ss, pair=('[(]', '[)]'), start=m.end() - 1)
        else:
            d = helper.match_pair(ss, start=m.end() - 1)
        if d is None:
            # Unbalanced delimiters: stop instead of re-matching the same
            # malformed entry forever (the original looped here).
            break
        current = ss[m.start():d[1] - 1]  # the entry currently analyzed
        st, entry = parseentry(current)
        if st is not None:
            strings.update(st)
        if entry:  # parseentry may return None or {} on failure
            entries[entry['_code']] = entry
        ss = ss[d[1] + 1:].strip()
    return strings, entries
def get_fields(strng, strict=False):
    """Return a list of (field, value) pairs parsed from *strng*.

    *strng* is the body of a BibTeX entry after the cite key: a sequence
    of ``name = value`` assignments separated by commas.  A value may be
    delimited by braces, by double quotes, or be bare (a number, or a
    string-macro expression whose pieces are joined with ``#``).

    If *strict* is True, only field names listed in
    ``helper.bibtexfields`` are kept.
    """
    ss = strng.strip()
    if not ss.endswith(','):  # make sure the last field is comma-terminated
        ss += ','
    fields = []
    while True:
        name, sep, ss = ss.partition('=')
        if sep == '':
            break  # no '=' left: we reached the end of the string
        name = name.strip().lower()
        # Error recovery: a field name should be a single word; if junk
        # from a previous value leaked in, keep only the last word.
        if len(name.split()) > 1:
            name = name.split()[-1]
        ss = ss.strip()
        if ss[0] == '{':  # value delimited by braces
            s, e = helper.match_pair(ss)
            data = ss[s + 1:e - 1].strip()
        elif ss[0] == '"':  # value delimited by double quotes
            s = ss.find('"')
            e = ss.find('"', s + 1)
            data = ss[s + 1:e].strip()
        else:  # bare value: a number or a string-macro expression
            e = ss.find(',')
            data = ss[0:e].strip()
            if not data.isdigit():  # then it involves string macros
                dd = data.split('#')  # '#' joins string macros in BibTeX
                if len(dd) > 1:
                    for n in range(len(dd)):
                        dd[n] = dd[n].strip()
                        dd[n] = dd[n].replace('{', '"').replace('}', '"')
                        if dd[n][0] != '"':
                            dd[n] = 'definitionofstring(%s) ' % (dd[n])
                    data = '#'.join(dd)
                else:
                    data = 'definitionofstring(%s) ' % (data.strip())
        # Advance past the comma that terminates this field.  The original
        # code did ``ss[e].find(',')`` -- a search inside a single
        # character -- which returned -1 whenever whitespace preceded the
        # comma and left stray text at the front of ss.
        e = ss.find(',', e)
        ss = ss[e + 1:]
        # JF: Temporario, descomentar si hay problemas
        # if name=='title':
        #     data=helper.capitalizestring(data)
        # else:
        #     data=helper.removebraces(data)
        if not strict or name in helper.bibtexfields:
            fields.append((name, data))
    return fields
def parsedata(data):
    """Parse a string holding a whole BibTeX database.

    Returns ``(strings, entries)``: *strings* collects the macro
    definitions reported by ``parseentry``, and *entries* maps each
    entry's cite key (stored under '_code' by ``parseentry``) to the
    parsed entry dict.
    """
    # An '@' followed by a word and an opening brace or parenthesis.
    pub_rex = re.compile(r'\s?@(\w*)\s*[{\(]')
    # Reformat the string: collapse every whitespace run to one space.
    ss = re.sub(r'\s+', ' ', data).strip()
    strings = {}
    entries = {}
    while True:
        m = pub_rex.search(ss)
        if m is None:
            break
        # Entries may be wrapped in parentheses instead of braces.
        if m.group(0)[-1] == '(':
            d = helper.match_pair(ss, pair=('[(]', '[)]'), start=m.end() - 1)
        else:
            d = helper.match_pair(ss, start=m.end() - 1)
        if d is None:
            # Unbalanced delimiters: stop instead of re-matching the same
            # malformed entry forever (the original looped here).
            break
        current = ss[m.start():d[1] - 1]  # the entry currently analyzed
        st, entry = parseentry(current)
        if st is not None:
            strings.update(st)
        if entry:  # parseentry may return None or {} on failure
            entries[entry['_code']] = entry
        ss = ss[d[1] + 1:].strip()
    return strings, entries
def get_fields(strng):
    """Return a list of (field, value) pairs parsed from *strng*.

    Older parser variant: scans *strng* for '=' signs and extracts the
    value after each one.  A value may be brace-delimited, double-quote
    delimited, or bare -- in the bare case the single character right
    after '=' is skipped, so a space is expected there.

    Fixed for Python 3: the removed ``string`` module functions
    (``string.strip``/``string.find``) are replaced with str methods.
    """
    f = strng.find('=')
    braces_rex = re.compile(r'\s*[{]')
    comilla_rex = re.compile(r'\s*["]')
    start = 0
    fields = []
    end = len(strng)
    # start holds the current position in the strng
    # f : position of equal sign
    # s : position of {, opening " or first line after the equal sign
    # e : position of closing }, " or next comma
    while f != -1 and start < end:
        name = strng[start:f].strip().lower()
        if name != '':
            ss = strng[f + 1:]
            if braces_rex.match(ss):  # value delimited by braces
                s, e = match_pair(ss)
                data = ss[s + 1:e - 1].strip()
            elif comilla_rex.match(ss):  # value delimited by quotes
                s = ss.find('"')
                e = ss.find('"', s + 1)
                data = ss[s + 1:e].strip()
            else:  # bare value; skips one char after '='
                s = 1
                e = ss.find(',')
                data = ss[s:e].strip()
            fields.append((name, data))
            # There is a trailing comma, we should take it out
            e = ss.find(',', e) + 1
            start = f + e + 2
        else:
            # Empty field name (stray '='): advance past it, otherwise
            # the scan would find the same '=' forever and never stop.
            start = f + 1
        f = strng.find('=', start)
    return fields
def bibtexload(filecontents_source):
    """Load a BibTeX database from an iterable of source lines.

    Returns ``(strings, entries)``: *strings* is a list of
    ``(name, definition)`` pairs collected from @string entries, and
    *entries* maps each cite key to a dict holding 'type', 'id' and one
    key per parsed field.
    """
    space_rex = re.compile(r'\s+')
    pub_rex = re.compile(r'\W?@(\w*)\s*{')
    filecontents = []
    # remove trailing and excessive whitespace
    for line in filecontents_source:
        line = line.strip()
        line = space_rex.sub(' ', line)
        filecontents.append(' ' + line)
    filecontents = ''.join(filecontents)  # the file is in one long string
    filecontents = no_outer_parens(filecontents)
    # character encoding, reserved latex characters: unescape '\&'
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)
    filecontents = filecontents.strip()
    #
    # Find entries
    #
    strings = []
    entries = {}
    start = 0
    s = 0
    e = 0
    final = len(filecontents) - 1
    while start < final:
        entry = {}
        m = pub_rex.search(filecontents[start:])
        if m:
            start += m.start()
            arttype = pub_rex.sub(r'\g<1>', m.group()).lower()
            d = match_pair(filecontents[start:])
            if d:
                s, e = d
                s += start + 1
                e += (start - 1)
                # current has the currently analyzed entry
                current = filecontents[s:e]
                if arttype == 'string':
                    name, defin = current.split("=")
                    # Strip quotes and collapse doubled spaces (the
                    # original replaced ' ' with ' ', a no-op --
                    # presumably a mangled two-space pattern).
                    defin = defin.replace('"', '').replace('  ', ' ')
                    strings.append((name.strip(), defin.strip()))
                elif arttype == 'comment' or arttype == 'preamble':
                    pass  # these entry types carry no data we keep
                else:
                    p = re.match(r'([^,]+),', current)
                    artid = p.group()[:-1]  # cite key, without the comma
                    entry['type'] = arttype
                    entry['id'] = artid
                    current = current[p.end():]
                    ff = get_fields(current)
                    for n, d in ff:
                        entry[n] = d
                    entries[artid] = entry
            # NOTE(review): if match_pair returned nothing, e is stale and
            # the scan may loop; kept as in the original -- confirm inputs
            # are always brace-balanced.
            start = e
        else:
            return strings, entries
    return strings, entries
def bibtexload(filecontents_source):
    """Load a BibTeX database from an iterable of source lines.

    Returns ``(strings, entries)``: *strings* is a list of
    ``(name, definition)`` pairs collected from @string entries, and
    *entries* maps each cite key to a dict holding 'type', 'id' and one
    key per parsed field.
    """
    space_rex = re.compile(r'\s+')
    pub_rex = re.compile(r'\W?@(\w*)\s*{')
    filecontents = []
    # remove trailing and excessive whitespace
    # ignore comments
    for line in filecontents_source:
        line = line.strip()
        line = space_rex.sub(' ', line)
        # ignore comments
        filecontents.append(' ' + line)
    filecontents = ''.join(filecontents)  # the file is in one long string
    filecontents = no_outer_parens(filecontents)
    # character encoding, reserved latex characters: unescape '\&'
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)
    filecontents = filecontents.strip()
    #
    # Find entries
    #
    strings = []
    entries = {}
    s = 0
    e = 0
    start = 0
    final = len(filecontents) - 1
    while start < final:
        entry = {}
        m = pub_rex.search(filecontents[start:])
        if m:
            start += m.start()
            arttype = pub_rex.sub(r'\g<1>', m.group()).lower()
            d = match_pair(filecontents[start:])
            if d:
                s, e = d
                s += start + 1
                e += (start - 1)
                # current has the currently analyzed entry
                current = filecontents[s:e]
                if arttype == 'string':
                    name, defin = current.split("=")
                    # Strip quotes and collapse doubled spaces (the
                    # original replaced ' ' with ' ', a no-op --
                    # presumably a mangled two-space pattern).
                    defin = defin.replace('"', '').replace('  ', ' ')
                    strings.append((name.strip(), defin.strip()))
                elif arttype == 'comment' or arttype == 'preamble':
                    pass  # these entry types carry no data we keep
                else:
                    p = re.match(r'([^,]+),', current)
                    artid = p.group()[:-1]  # cite key, without the comma
                    entry['type'] = arttype
                    entry['id'] = artid
                    current = current[p.end():]
                    ff = get_fields(current)
                    for n, d in ff:
                        entry[n] = d
                    entries[artid] = entry
            # NOTE(review): if match_pair returned nothing, e is stale and
            # the scan may loop; kept as in the original -- confirm inputs
            # are always brace-balanced.
            start = e
        else:
            return strings, entries
    return strings, entries
def authors(data):
    """Split a BibTeX author string into name-part dictionaries.

    *data* is the raw contents of an ``author`` field ("A and B and ...").
    Returns a list with one dict per author; each dict carries only the
    non-empty parts among "first", "von", "last" and "jr".
    """
    tokenized = []
    a = []
    # sticky accumulates a token whose opening brace has not closed yet:
    # (token kind or None, text gathered so far).
    sticky = (None, "")
    #determine the case of the word
    for i in re.finditer("(?P<caseless>[{\\\][^,\s]*)|(?P<separator>,)"
                         "|(?P<word>[^\s,]+)|(?P<space>\s)", data):
        if not sticky[0] and re.search("{", i.group(0)) \
           and not match_pair(i.group(0)):
            # brace not closed?
            if i.group("caseless"):
                sticky = ("caseless", i.group(0))
            elif i.group("word"):
                sticky = ("word", i.group(0))
            continue
        elif sticky[0] and not match_pair(sticky[1] + i.group(0)):
            # Still inside an unbalanced brace group: keep accumulating.
            sticky = (sticky[0], sticky[1] + i.group(0))
            continue
        if sticky[0]:
            # The brace group just closed; emit it as a single token.
            match = sticky[1] + i.group(0)
            token = sticky[0]
            sticky = (None, "")
        else:
            match = i.group(0)
            if i.group("caseless"):
                token = "caseless"
            if i.group("word"):
                token = "word"
            if i.group("separator"):
                # NOTE(review): separators are stored as the bare string
                # "separator" mixed among the (case, text) tuples; the
                # second phase relies on this.
                a.append("separator")
                token = "separator"
            if i.group("space"):
                token = "space"
        if token == "caseless":
            # Strip leading balanced brace groups to reach the first
            # character that decides the word's case.
            m = (0, 0)
            caseless = match
            while m:
                m = match_pair(caseless)
                if m and m[0] == 0:
                    caseless = caseless[m[1]:]
                else:
                    break
            w = re.search("[\w]", caseless)
            if len(caseless) > 0 and w:
                if w.group(0).islower() or w.group(0).isdigit():
                    a.append(("lowercase", match))
                else:
                    a.append(("uppercase", match))
            else:
                # Entirely braced: case cannot be determined.
                a.append(("caseless", match))
        elif token == "word":
            if match == "and":
                # "and" separates authors.
                tokenized.append(a)
                a = []
            elif match[0].islower() or match[0].isdigit():
                a.append(("lowercase", match))
            else:
                a.append(("uppercase", match))
    if sticky[0]:
        pass
        #raise Exception("Brace error!")
    tokenized.append(a)
    #determine the cite structure
    ret = []
    for author in tokenized:
        count = author.count("separator")
        a = {"first": "", "von": "", "last": "", "jr": ""}
        #First von Last
        if count == 0:
            index = 0
            #first
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "lowercase":
                    a["first"] += " " + word[1]
                else:
                    author = author[index:]
                    break
            #von
            caseless = []
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "uppercase":
                    if word[0] == "caseless":
                        caseless.append(word[1])
                    elif word[0] == "lowercase":
                        for w in caseless:
                            a["von"] += " " + w
                        caseless = []
                        a["von"] += " " + word[1]
                else:
                    author = author[index:]
            #last
            for word in caseless:
                a["last"] += " " + word
            for index, word in enumerate(author):
                a["last"] += " " + word[1]
        #von Last, [jr ,] First
        elif count > 0:
            #von
            upper = []
            for index, word in enumerate(author):
                # NOTE(review): author[index + 1] can raise IndexError on
                # a trailing token; presumably a separator always follows
                # -- confirm with callers.
                if author[index + 1] == "separator":
                    upper.append(word[1])
                    author = author[index + 2:]
                    break
                if word == "uppercase":
                    # NOTE(review): compares a (case, text) tuple against
                    # a string, so this branch never fires as written.
                    upper.append(word)
                elif word != "separator":
                    for w in upper:
                        a["von"] += " " + w
                    upper = []
                    a["von"] += " " + word[1]
                else:
                    author = author[index + 1:]
                    break
            #last
            for word in upper:
                a["last"] += " " + word
            #jr
            if count > 1:
                for index, word in enumerate(author):
                    if word != "separator":
                        a["jr"] += " " + word[1]
                    else:
                        author = author[index + 1:]
                        break
            #first
            for index, word in enumerate(author):
                if word != "separator":
                    a["first"] += " " + word[1]
                else:
                    a["first"] += ","
        elif count > 1:
            pass
        # Keep only the non-empty parts, without the leading space.
        b = {}
        for k in a:
            if len(a[k]) > 0:
                b[k] = a[k]
                b[k] = b[k].lstrip()
        ret.append(b)
    return ret
def authors(data):
    """Split a BibTeX author string into name-part dictionaries.

    *data* is the raw contents of an ``author`` field ("A and B and ...").
    Returns a list with one dict per author; each dict carries only the
    non-empty parts among "first", "von", "last" and "jr".
    """
    tokenized = []
    a = []
    # sticky accumulates a token whose opening brace has not closed yet:
    # (token kind or None, text gathered so far).
    sticky = (None, "")
    #determine the case of the word
    for i in re.finditer(
            "(?P<caseless>[{\\\][^,\s]*)|(?P<separator>,)"
            "|(?P<word>[^\s,]+)|(?P<space>\s)", data):
        if not sticky[0] and re.search("{", i.group(0)) \
           and not match_pair(i.group(0)):
            # brace not closed?
            if i.group("caseless"):
                sticky = ("caseless", i.group(0))
            elif i.group("word"):
                sticky = ("word", i.group(0))
            continue
        elif sticky[0] and not match_pair(sticky[1] + i.group(0)):
            # Still inside an unbalanced brace group: keep accumulating.
            sticky = (sticky[0], sticky[1] + i.group(0))
            continue
        if sticky[0]:
            # The brace group just closed; emit it as a single token.
            match = sticky[1] + i.group(0)
            token = sticky[0]
            sticky = (None, "")
        else:
            match = i.group(0)
            if i.group("caseless"):
                token = "caseless"
            if i.group("word"):
                token = "word"
            if i.group("separator"):
                # NOTE(review): separators are stored as the bare string
                # "separator" mixed among the (case, text) tuples; the
                # second phase relies on this.
                a.append("separator")
                token = "separator"
            if i.group("space"):
                token = "space"
        if token == "caseless":
            # Strip leading balanced brace groups to reach the first
            # character that decides the word's case.
            m = (0, 0)
            caseless = match
            while m:
                m = match_pair(caseless)
                if m and m[0] == 0:
                    caseless = caseless[m[1]:]
                else:
                    break
            w = re.search("[\w]", caseless)
            if len(caseless) > 0 and w:
                if w.group(0).islower() or w.group(0).isdigit():
                    a.append(("lowercase", match))
                else:
                    a.append(("uppercase", match))
            else:
                # Entirely braced: case cannot be determined.
                a.append(("caseless", match))
        elif token == "word":
            if match == "and":
                # "and" separates authors.
                tokenized.append(a)
                a = []
            elif match[0].islower() or match[0].isdigit():
                a.append(("lowercase", match))
            else:
                a.append(("uppercase", match))
    if sticky[0]:
        pass
        #raise Exception("Brace error!")
    tokenized.append(a)
    #determine the cite structure
    ret = []
    for author in tokenized:
        count = author.count("separator")
        a = {"first": "", "von": "", "last": "", "jr": ""}
        #First von Last
        if count == 0:
            index = 0
            #first
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "lowercase":
                    a["first"] += " " + word[1]
                else:
                    author = author[index:]
                    break
            #von
            caseless = []
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "uppercase":
                    if word[0] == "caseless":
                        caseless.append(word[1])
                    elif word[0] == "lowercase":
                        for w in caseless:
                            a["von"] += " " + w
                        caseless = []
                        a["von"] += " " + word[1]
                else:
                    author = author[index:]
            #last
            for word in caseless:
                a["last"] += " " + word
            for index, word in enumerate(author):
                a["last"] += " " + word[1]
        #von Last, [jr ,] First
        elif count > 0:
            #von
            upper = []
            for index, word in enumerate(author):
                # NOTE(review): author[index + 1] can raise IndexError on
                # a trailing token; presumably a separator always follows
                # -- confirm with callers.
                if author[index + 1] == "separator":
                    upper.append(word[1])
                    author = author[index + 2:]
                    break
                if word == "uppercase":
                    # NOTE(review): compares a (case, text) tuple against
                    # a string, so this branch never fires as written.
                    upper.append(word)
                elif word != "separator":
                    for w in upper:
                        a["von"] += " " + w
                    upper = []
                    a["von"] += " " + word[1]
                else:
                    author = author[index + 1:]
                    break
            #last
            for word in upper:
                a["last"] += " " + word
            #jr
            if count > 1:
                for index, word in enumerate(author):
                    if word != "separator":
                        a["jr"] += " " + word[1]
                    else:
                        author = author[index + 1:]
                        break
            #first
            for index, word in enumerate(author):
                if word != "separator":
                    a["first"] += " " + word[1]
                else:
                    a["first"] += ","
        elif count > 1:
            pass
        # Keep only the non-empty parts, without the leading space.
        b = {}
        for k in a:
            if len(a[k]) > 0:
                b[k] = a[k]
                b[k] = b[k].lstrip()
        ret.append(b)
    return ret