def __init__(self):
    """
    Builds every pattern string and compiled regular expression used by the
    extractor and loads the location dictionaries.

    All regular expressions are written as raw strings so that sequences
    such as ``\\s`` or ``\\W`` reach the regex engine unchanged instead of
    relying on Python passing unknown string escapes through.

    @raise DictionaryError: when the country/city dictionaries cannot be
                            loaded
    """
    # --- pattern fragments -------------------------------------------------
    self._pat_abstract_1 = r'(Abstract|ABSTRACT)'
    self._pat_abstract_2 = (r'(In this|This (paper|study|article|report)'
                            r'|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))')
    # "!A_E!" is the marker find_keywords appends to the text.
    self._pat_abstract_end = r'(!A_E!|$|\n\s*\n)'
    self._pat_keywords = (
        r'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
        r'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
        r'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?')
    self.pat_months = (
        r'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
        r'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
        r'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?')
    self.pat_junctions = \
        'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
    self.pat_middle_name = (
        '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
        'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
        'Chen|Van de|Van den|el|El)')
    self.alone_name = \
        r'([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
    self.roman_num = ('(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
                      'VIII[ .]|IX[ .]|X[ .])')

    # --- regular expressions ----------------------------------------------
    self._re_abstract_1 = re.compile(
        self._pat_abstract_1 + r'\W+(.+?)' + self._pat_abstract_end,
        re.DOTALL)
    self._re_abstract_2 = re.compile(
        self._pat_abstract_2 + r'\W+(.+?)' + self._pat_abstract_end,
        re.DOTALL)
    self._re_keywords_1 = re.compile(
        '(' + self._pat_keywords + r')[-:,;. ]*?\n+(.+?)\n')
    self._re_keywords_2 = re.compile(
        '(' + self._pat_keywords + r')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
    self._re_meta = re.compile(r'(^.*)(authors?:|emails?:|editors?:)', re.I)
    self.re_published = re.compile(r'(^|\s)published (in|with)', re.I)
    self.re_issn = re.compile(r'ISSN|[0-9]{4}-[0-9]{4}')
    self.re_rep = re.compile(r'(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
    self.re_lower_start = re.compile(r"^[a-z][^A-Z-'`]")
    self.re_vol = re.compile(r'vol(ume)?\.? *[0-9]+', re.I)
    self.re_rev = re.compile(r'[Rr]evision *[0-9]+')
    self.re_inproc = \
        re.compile(r'in proc\.|in proceedings|proceedings of', re.I)
    self.re_no = re.compile(r'n(o|umber)\.? *[0-9]+', re.I)
    self.re_pages = re.compile(r'(pages?|p\.) [0-9]', re.I)
    self.re_etal = re.compile(r'et al\.?(\s|$)', re.I)
    self.re_copyright = re.compile(r'(^|\s)copyright(\s|$)', re.I)
    self.re_date = re.compile(
        '(' + self.pat_months + r') .*?[0-9]{4}', re.I)
    self.re_year = re.compile(r'\([0-9]{4}\)')
    self.re_num = re.compile(r'^[0-9\s%]+$', re.DOTALL)
    self.re_noletter = re.compile(r'^[^a-z]+$', re.IGNORECASE)
    self.re_upper_word = re.compile(r'[A-Z][A-Z]+')
    self.re_lower_word = re.compile(r'[a-z]')
    self.re_telfax = re.compile(r'(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
    self.re_empty = re.compile(r'^\s*$', re.DOTALL)
    self.re_notitle = \
        re.compile(r'\s(conference|symposium)\s', re.M | re.I)
    # NOTE(review): this pattern used to be assigned to ``re_title`` and was
    # immediately overwritten by the "Title:" pattern below, so it never
    # took part in any match. It is kept under its own name; presumably it
    # was meant to reject lines such as "ICML 2005" - confirm before use.
    self.re_acronym_year = \
        re.compile(r'[A-Z][A-Z]+( [A-Z][A-Z])* [0-9]{4}')
    self.re_mark = \
        re.compile(r'(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
    # NOTE(review): "|" binds loosest, so the alternatives are
    # "(^|\s)thesis", a bare "article" and "journal(\s|$)".
    self.re_type = \
        re.compile(r'(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
    self.re_organization = \
        re.compile(r'department of|university|school of', re.IGNORECASE)
    self.re_press = \
        re.compile(r'(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
    self.re_domain = re.compile(r'.+\.[a-z]{2,3}$', re.IGNORECASE)
    # Brackets inside the sets are escaped to avoid the "possible nested
    # set" FutureWarning; the matched characters are unchanged.
    self.re_zav = re.compile(r'^[\[(].*[\])]$')
    # Group 1 is the title text, group 2 the terminator ("." or end).
    self.re_title = re.compile(r'^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')

    # Building blocks of the author pattern. The concatenation below
    # yields exactly the pattern text of the former single literal.
    middle = self.pat_middle_name
    word = r"[A-Z][-A-Za-z'´`]+"          # capitalised name part
    initial = r"([A-Z]\.?-)?[A-Z]\."      # "J." or "J.-P."
    any_parts = ("((" + word + "[.,]?|" + middle + "[.,]?|" + initial
                 + ")[ ]+)*?")
    # NOTE(review): the pattern is compiled with re.VERBOSE, so blanks that
    # are not inside a character class (e.g. in "van der", or the blank
    # after "(?<=\.)") are ignored by the regex engine.
    self.re_authors = re.compile(
        "^[ ,.&]*("
        + any_parts
        + "("
        # Firstname von Surname
        + "((" + word + ")[.,]?[ ]+)"
        + any_parts
        + "((" + word + ")[ ]*)"
        + "|"
        # Surname, J.
        + "((" + word + "[.,]?)[ ]+)"
        + any_parts
        + "((" + middle + r"[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?"
        + "|"
        # J. von Surname
        + "((" + middle + "[.,]?|" + initial + ")[ ]+)+?"
        + any_parts
        + "((" + word + ")[ ]*)"
        + ")"
        + "(([A-Z]v+|" + middle + "|" + initial + ")[ ]*)*?"
        + "(([., ]*(" + self.alone_name + "|" + self.roman_num + ")?)*"
        + r"((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?"
        + "([., ]*" + self.alone_name + ")*)",
        re.VERBOSE)
    self.re_inc = re.compile(
        r'[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name + '|'
        + self.alone_name + '|&)[^a-zA-Z]')
    self.re_end = re.compile(
        '([-:,]| (' + self.pat_junctions + r'))\s*\n', re.IGNORECASE)
    self.re_split = \
        re.compile(r'^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
    self.re_upper = re.compile(r'([A-Z][A-Z]+ ){3}')
    self.re_lower = re.compile(r'(^.+?) (.[^ ]?[a-z].*$)')
    self.re_by = re.compile(r'(^.+) by (.+?)$', re.DOTALL)
    self.re_autinline = \
        re.compile(r'^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
    self.re_word_end = re.compile(r' [^A-Z]+\s*$')
    self.re_term = re.compile(r'(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')

    # --- dictionaries -------------------------------------------------------
    try:
        self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
        self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
    except RRSDictionaryError:
        raise DictionaryError("Failed to load dictionaries.")
class MetaExtractor(object):
    """
    This class contains functions for basic metadata searching in articles.
    """

    # Trailing whitespace and backslashes stripped from a rewritten line.
    _RE_LINE_TAIL = re.compile(r'(\s|\n|\\)+$')

    def __init__(self):
        """
        Compiles all patterns and loads the location dictionaries.

        @raise DictionaryError: when the country/city dictionaries cannot be
                                loaded
        """
        self._compile_patterns()
        # Dictionaries
        try:
            self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
            self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
        except RRSDictionaryError:
            raise DictionaryError("Failed to load dictionaries.")

    def _compile_patterns(self):
        """
        Builds every pattern string and compiled regular expression used by
        the extractor. Needs nothing but the ``re`` module.

        All regular expressions are raw strings so that sequences such as
        ``\\s`` or ``\\W`` reach the regex engine unchanged.
        """
        # --- pattern fragments ---------------------------------------------
        self._pat_abstract_1 = r'(Abstract|ABSTRACT)'
        self._pat_abstract_2 = (
            r'(In this|This (paper|study|article|report)'
            r'|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))')
        # "!A_E!" is the marker find_keywords appends to the text.
        self._pat_abstract_end = r'(!A_E!|$|\n\s*\n)'
        self._pat_keywords = (
            r'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
            r'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
            r'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?')
        self.pat_months = (
            r'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
            r'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
            r'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?')
        self.pat_junctions = \
            'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
        self.pat_middle_name = (
            '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
            'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
            'Chen|Van de|Van den|el|El)')
        self.alone_name = \
            r'([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
        self.roman_num = (
            '(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
            'VIII[ .]|IX[ .]|X[ .])')

        # --- regular expressions -------------------------------------------
        self._re_abstract_1 = re.compile(
            self._pat_abstract_1 + r'\W+(.+?)' + self._pat_abstract_end,
            re.DOTALL)
        self._re_abstract_2 = re.compile(
            self._pat_abstract_2 + r'\W+(.+?)' + self._pat_abstract_end,
            re.DOTALL)
        self._re_keywords_1 = re.compile(
            '(' + self._pat_keywords + r')[-:,;. ]*?\n+(.+?)\n')
        self._re_keywords_2 = re.compile(
            '(' + self._pat_keywords + r')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
        self._re_meta = \
            re.compile(r'(^.*)(authors?:|emails?:|editors?:)', re.I)
        self.re_published = re.compile(r'(^|\s)published (in|with)', re.I)
        self.re_issn = re.compile(r'ISSN|[0-9]{4}-[0-9]{4}')
        self.re_rep = \
            re.compile(r'(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
        self.re_lower_start = re.compile(r"^[a-z][^A-Z-'`]")
        self.re_vol = re.compile(r'vol(ume)?\.? *[0-9]+', re.I)
        self.re_rev = re.compile(r'[Rr]evision *[0-9]+')
        self.re_inproc = \
            re.compile(r'in proc\.|in proceedings|proceedings of', re.I)
        self.re_no = re.compile(r'n(o|umber)\.? *[0-9]+', re.I)
        self.re_pages = re.compile(r'(pages?|p\.) [0-9]', re.I)
        self.re_etal = re.compile(r'et al\.?(\s|$)', re.I)
        self.re_copyright = re.compile(r'(^|\s)copyright(\s|$)', re.I)
        self.re_date = re.compile(
            '(' + self.pat_months + r') .*?[0-9]{4}', re.I)
        self.re_year = re.compile(r'\([0-9]{4}\)')
        self.re_num = re.compile(r'^[0-9\s%]+$', re.DOTALL)
        self.re_noletter = re.compile(r'^[^a-z]+$', re.IGNORECASE)
        self.re_upper_word = re.compile(r'[A-Z][A-Z]+')
        self.re_lower_word = re.compile(r'[a-z]')
        self.re_telfax = re.compile(r'(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
        self.re_empty = re.compile(r'^\s*$', re.DOTALL)
        self.re_notitle = \
            re.compile(r'\s(conference|symposium)\s', re.M | re.I)
        # NOTE(review): this pattern used to be assigned to ``re_title`` and
        # was immediately overwritten by the "Title:" pattern below, so it
        # never took part in any match. It is kept under its own name and is
        # not consulted by any method here; presumably it was meant to
        # reject lines such as "ICML 2005" - confirm before wiring it in.
        self.re_acronym_year = \
            re.compile(r'[A-Z][A-Z]+( [A-Z][A-Z])* [0-9]{4}')
        self.re_mark = \
            re.compile(r'(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
        # NOTE(review): "|" binds loosest, so the alternatives are
        # "(^|\s)thesis", a bare "article" and "journal(\s|$)".
        self.re_type = re.compile(
            r'(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
        self.re_organization = \
            re.compile(r'department of|university|school of', re.IGNORECASE)
        self.re_press = \
            re.compile(r'(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
        self.re_domain = re.compile(r'.+\.[a-z]{2,3}$', re.IGNORECASE)
        # Brackets inside the sets are escaped to avoid the "possible nested
        # set" FutureWarning; the matched characters are unchanged.
        self.re_zav = re.compile(r'^[\[(].*[\])]$')
        # Group 1 is the title text, group 2 the terminator ("." or end).
        self.re_title = re.compile(r'^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')

        # Building blocks of the author pattern. The concatenation below
        # yields exactly the pattern text of the former single literal.
        middle = self.pat_middle_name
        word = r"[A-Z][-A-Za-z'´`]+"          # capitalised name part
        initial = r"([A-Z]\.?-)?[A-Z]\."      # "J." or "J.-P."
        any_parts = ("((" + word + "[.,]?|" + middle + "[.,]?|" + initial
                     + ")[ ]+)*?")
        # NOTE(review): the pattern is compiled with re.VERBOSE, so blanks
        # that are not inside a character class (e.g. in "van der", or the
        # blank after "(?<=\.)") are ignored by the regex engine.
        self.re_authors = re.compile(
            "^[ ,.&]*("
            + any_parts
            + "("
            # Firstname von Surname
            + "((" + word + ")[.,]?[ ]+)"
            + any_parts
            + "((" + word + ")[ ]*)"
            + "|"
            # Surname, J.
            + "((" + word + "[.,]?)[ ]+)"
            + any_parts
            + "((" + middle + r"[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?"
            + "|"
            # J. von Surname
            + "((" + middle + "[.,]?|" + initial + ")[ ]+)+?"
            + any_parts
            + "((" + word + ")[ ]*)"
            + ")"
            + "(([A-Z]v+|" + middle + "|" + initial + ")[ ]*)*?"
            + "(([., ]*(" + self.alone_name + "|" + self.roman_num + ")?)*"
            + r"((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?"
            + "([., ]*" + self.alone_name + ")*)",
            re.VERBOSE)
        self.re_inc = re.compile(
            r'[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name + '|'
            + self.alone_name + '|&)[^a-zA-Z]')
        self.re_end = re.compile(
            '([-:,]| (' + self.pat_junctions + r'))\s*\n', re.IGNORECASE)
        self.re_split = \
            re.compile(r'^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
        self.re_upper = re.compile(r'([A-Z][A-Z]+ ){3}')
        self.re_lower = re.compile(r'(^.+?) (.[^ ]?[a-z].*$)')
        self.re_by = re.compile(r'(^.+) by (.+?)$', re.DOTALL)
        self.re_autinline = \
            re.compile(r'^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
        self.re_word_end = re.compile(r' [^A-Z]+\s*$')
        self.re_term = re.compile(r'(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')

    def _is_title_impossible(self, line, check_lower=False):
        """
        Decides whether a line can be ruled out as an article title.

        @param line: one line of the metadata text
        @type line: str
        @param check_lower: when True, a mixed-case line whose first
                            all-capitals word is a known location is also
                            rejected
        @type check_lower: bool
        @return: True when the line cannot be a title
        @rtype: bool
        """
        if len(line) > 300:
            return True
        rejecting = (
            self.re_empty, self.re_notitle, self.re_title, self.re_rep,
            self.re_issn, self._re_meta, self.re_num, self.re_lower_start,
            self.re_telfax, self.re_vol, self.re_no, self.re_date,
            self.re_year, self.re_pages, self.re_inproc, self.re_type,
            self.re_etal, self.re_copyright, self.re_published,
            self.re_mark, self.re_organization, self.re_press,
            self.re_domain, self.re_zav, self.re_noletter, self.re_rev,
        )
        if any(regex.search(line) for regex in rejecting):
            return True
        if check_lower and self.re_lower_word.search(line):
            upper = self.re_upper_word.search(line)
            if upper and self.rrsdict_locations.contains_key(upper.group(0)):
                return True
        return False

    def _looks_like_authors(self, text):
        """
        Tells whether text matches both the author-hint and the author-list
        pattern. A MemoryError raised while matching counts as a match,
        which is how both callers treated it before.
        """
        try:
            return bool(self.re_inc.search(text)
                        and self.re_authors.search(text))
        except MemoryError:
            return True

    def _strip_line_end(self, text):
        """Removes trailing whitespace and backslashes from text."""
        return self._RE_LINE_TAIL.sub("", text)

    def _split_compound_lines(self, meta_text):
        """
        Walks the lines of meta_text and, where one line seems to hold more
        than one piece of metadata, inserts a line break between the pieces.

        Replacements use str.replace so the document text is inserted
        literally; it used to be passed to re.sub as a replacement template,
        where a backslash in the text is interpreted as an escape.

        @return: the text with the extra line breaks inserted
        @rtype: str
        """
        for line in meta_text.splitlines():
            if self._is_title_impossible(line, True):
                continue
            if self.re_upper.search(line) and self.re_lower.search(line):
                # Run of upper-case words followed by mixed-case text.
                while self.re_upper.search(line) \
                        and self.re_lower.search(line):
                    parts = self.re_lower.search(line)
                    new_line = self._strip_line_end(
                        parts.group(1) + "\n" + parts.group(2))
                    meta_text = meta_text.replace(line, new_line)
                    line = parts.group(2)
            elif self.re_autinline.search(line) and \
                    not self.re_word_end.search(line):
                while self.re_autinline.search(line) and \
                        not self.re_word_end.search(line):
                    parts = self.re_autinline.search(line)
                    new_line = self._strip_line_end(
                        parts.group(1) + "\n" + parts.group(2))
                    meta_text = meta_text.replace(line, new_line)
                    line = parts.group(2)
            elif self.re_by.search(line):
                # "<something> by <authors>" is split only when the part
                # after "by" looks like an author list.
                parts = self.re_by.search(line)
                if self._looks_like_authors(parts.group(2)):
                    new_line = self._strip_line_end(
                        parts.group(1) + "\n" + parts.group(2))
                    meta_text = meta_text.replace(line, new_line)
        return meta_text

    def _select_title(self, lines):
        """
        Picks the title from the prepared lines.

        A line containing "Title:" wins and ends the scan. Otherwise the
        first candidate with more than one word is used, falling back to
        the last one-word candidate.

        @return: the title or None
        @rtype: str
        """
        candidates = []
        for line in lines:
            # A line that contains the text "Title:" is taken as the title.
            labelled = self.re_title.search(line)
            if labelled:
                # Group 1 holds the text after "Title:". Group 2, which was
                # returned before, is only the terminating "." or "".
                title = labelled.group(1)
                meta = self._re_meta.search(title)
                if meta:
                    title = meta.group(1)
                if title != "":
                    return title
                break
            if self._looks_like_authors(line):
                continue
            if self._is_title_impossible(line):
                continue
            # The line met all requirements.
            candidates.append(line)
        title = None
        for candidate in candidates:
            title = candidate
            if len(candidate.split(" ")) > 1:
                break
        return title

    def find_title(self, meta_text):
        """
        This method finds article title in text.

        @param meta_text: part of the text containing metadata
        @type meta_text: str
        @return: returns tuple - title (or None) and the unmodified input
                 text
        @rtype: (str, str)
        """
        meta_text_orig = meta_text
        # Join a line ending in "-", ":", "," or a junction word with the
        # line that follows it.
        while self.re_end.search(meta_text):
            meta_text = self.re_end.sub(r"\g<1> ", meta_text)
        # Repair a title written in capital letters that was split over
        # several lines.
        broken = self.re_split.search(meta_text)
        if broken:
            orig_txt = broken.group(0)
            new_txt = re.sub(" +", " ", orig_txt.replace("\n", " ")) + "\n"
            meta_text = meta_text.replace(orig_txt, new_txt)
        # Split lines that hold several pieces of metadata, then collect
        # the lines of the header again and choose among them.
        meta_text = self._split_compound_lines(meta_text)
        title = self._select_title(meta_text.splitlines())
        return (title, meta_text_orig)

    def find_abstract(self, meta_text):
        """
        This method finds article abstract in text.

        @param meta_text: part of the text containing metadata
        @type meta_text: str
        @return: returns tuple - abstract (or None) and the rest of the text
        @rtype: (str, str)
        """
        abstract = None
        headed = self._re_abstract_1.search(meta_text)
        if headed:
            abstract = headed.group(2)
            meta_text = meta_text.replace(headed.group(0), "")
        else:
            opening = self._re_abstract_2.search(meta_text)
            if opening:
                # Keep the opening phrase but leave out the terminator (the
                # last group of the pattern), so the internal "!A_E!" marker
                # does not end up inside the abstract.
                terminator = opening.re.groups
                abstract = \
                    meta_text[opening.start():opening.start(terminator)]
                meta_text = meta_text.replace(opening.group(0), "")
        if abstract is not None:
            if re.search('[a-z]', abstract, re.I) is None:
                abstract = None
            else:
                abstract = abstract.replace("\n", " ")
        return (abstract, meta_text)

    def find_keywords(self, meta_txt):
        """
        This method finds article keywords in text.

        @param meta_txt: part of the text containing metadata
        @type meta_txt: str
        @return: returns tuple - keywords and the rest of the text
        @rtype: ([RRSKeyword], str)
        """
        keywords = []
        while True:
            # Prefer a header followed by a line break, then a header
            # followed by the keywords on the same line.
            section = self._re_keywords_1.search(meta_txt)
            if section:
                keywords_text = section.group(2)
            else:
                section = self._re_keywords_2.search(meta_txt)
                if not section:
                    break
                keywords_text = section.group(2)
                if section.group(3) != "\n":
                    keywords_text += section.group(3).replace(".", "")
            meta_txt = meta_txt.replace(section.group(0), "")
            # Further keyword headers inside the list act as separators.
            while re.search(self._pat_keywords, keywords_text):
                keywords_text = re.sub(self._pat_keywords, ", ",
                                       keywords_text)
            if keywords_text == "":
                return (keywords, meta_txt)
            count = 1
            is_upper = False
            while self.re_term.search(keywords_text):
                term = self.re_term.search(keywords_text)
                if re.search('[A-Z]', term.group(2)):
                    if not is_upper and count > 3:
                        # First capitalised term after more than three
                        # terms: the remainder is handed back to the text
                        # behind the "!A_E!" marker.
                        meta_txt += " !A_E! " + keywords_text
                        break
                    else:
                        is_upper = True
                keywords.append(RRSKeyword(title=term.group(2)))
                keywords_text = keywords_text.replace(term.group(0), "")
                count += 1
        return (keywords, meta_txt)
def __init__(self):
    """
    Builds the keyword tables used for document typing, compiles the
    regular expressions used for proceedings splitting and loads the event
    acronym dictionary.

    All regular expressions are written as raw strings so that sequences
    such as ``\\s`` or ``\\W`` reach the regex engine unchanged.
    """
    self.pat_month = (
        r"january|february|march|april|may|june|july|august|"
        r"september|october|november|december|jan\.?|feb\.?|"
        r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        r"nov\.?|dec\.?")
    # The month alternation is wrapped in a non-capturing group. Without it
    # the blank and the optional day belonged only to the last alternative
    # ("dec\.?"), so e.g. "march 12 2004" could not match. Group numbers
    # are unchanged.
    self.pat_date = (r"((?:" + self.pat_month + r") ([_0-9]+\.? )*)"
                     r"(19[0-9]{2}|2[0123][0-9]{2})")

    # Keyword tables: key is a regex (or a callable taking the text),
    # value is the score added when it is found.
    self.kws_unpublished = {
        "introduction": 50,
        "abstract": 50,
        "related work": 50,
    }
    self.kws_article = {
        r"(vol\.?|volume)\s*[0-9]+": 20,
        self.pat_date: 20,
        r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
        r"(number|no\.?)\s*[_0-9]+": 20,
        "copyright": 20,
        "all rights reserved": 20,
        "journal": 20,
        "is published": 20,
        "published in": 20,
        "first published": 20,
        "in press": 20,
        "introduction": 50,
        "abstract": 50,
        "related work": 50,
        self._find_events: 50,
        self._find_events_2: 20,
    }
    self.kws_techreport = {
        "this report": 200,
        "tech[a-z]+ report": 200,
        "summary report": 100,
        "is (a )?report": 80,
    }
    self.kws_phdthesis = {
        "supervisor": 100,
        "this thesis": 200,
        "dissertation": 200,
        # Lower-cased: get_document_type searches string keywords in the
        # lower-cased text without re.IGNORECASE, so "Ph\.?D thesis" could
        # never match.
        r"ph\.?d thesis": 200,
    }
    self.kws_masterthesis = {
        "supervisor": 100,
        "this thesis": 200,
        "master thesis": 200,
        r"master\W?s thesis": 200,
    }
    self.types = {
        UNPUBLISHED: self.kws_unpublished,
        ARTICLE: self.kws_article,
        PHDTHESIS: self.kws_phdthesis,
        MASTERTHESIS: self.kws_masterthesis,
        TECHREPORT: self.kws_techreport,
    }

    # NOTE(review): this pattern used to be assigned to ``re_proceedings``
    # and was overwritten further down, so it never took part in any match.
    # It is kept under its own name; presumably it was meant for
    # _is_proceedings - confirm before wiring it in.
    self.re_proceedings_keyword = \
        re.compile(r'\W(proceedings|conference)\W', re.DOTALL)
    self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
    self.re_oo_impress = re.compile('impress', re.IGNORECASE)
    self.re_pages = re.compile(r'Pages:\s*([0-9]+)')
    self.pat_chapters = "R(eferences|EFERENCES)"
    self.re_chapters = re.compile(
        r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

    # Proceedings articles patterns and RE
    self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
    self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
    self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
    self.pat_roman_nums = (
        self.pat_pagesep
        + "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|"
          "IV|III|II|I)"
        + self.pat_pagesep)
    self.pat_page_end_strict = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
        + r"|\npage [0-9]+\n|\.[0-9]+" + self.pat_pagesep + ")")
    self.pat_page_end = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
        + r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)")
    prefix_head = "(^.+((" + self.pat_time + "|" + self.pat_toc_page + ").*"
    prefix_tail = "|" + self.pat_roman_nums + "))"
    self.re_proceedings_prefix_strict = re.compile(
        prefix_head + self.pat_page_end_strict + prefix_tail,
        re.DOTALL | re.IGNORECASE)
    self.re_proceedings_prefix = re.compile(
        prefix_head + self.pat_page_end + prefix_tail,
        re.DOTALL | re.IGNORECASE)
    self.re_abstract = re.compile(
        r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
        re.DOTALL | re.IGNORECASE)
    self.re_proceedings = re.compile(
        r"(^.+?\Wproceedings\W.*?\n\n)", re.DOTALL | re.IGNORECASE)
    self.re_multi_strict = re.compile(
        r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?'
        + self.pat_page_end_strict + ')', re.DOTALL)
    self.re_previous_1 = re.compile(
        r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
        r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
        r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?'
        + self.pat_page_end + ')', re.DOTALL)
    self.re_previous_2 = re.compile(
        r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
        r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
        r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])', re.DOTALL)
    self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)
class DocumentInfo(object):
    """
    This class collects information about PDF documents.
    """

    def __init__(self):
        """
        Compiles the patterns, builds the keyword tables used for document
        typing and loads the event acronym dictionary.
        """
        self._compile_patterns()

        # Keyword tables: key is a regex (or a callable taking the text),
        # value is the score added when it is found.
        self.kws_unpublished = {
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
        }
        self.kws_article = {
            r"(vol\.?|volume)\s*[0-9]+": 20,
            self.pat_date: 20,
            r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
            r"(number|no\.?)\s*[_0-9]+": 20,
            "copyright": 20,
            "all rights reserved": 20,
            "journal": 20,
            "is published": 20,
            "published in": 20,
            "first published": 20,
            "in press": 20,
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
            self._find_events: 50,
            self._find_events_2: 20,
        }
        self.kws_techreport = {
            "this report": 200,
            "tech[a-z]+ report": 200,
            "summary report": 100,
            "is (a )?report": 80,
        }
        self.kws_phdthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "dissertation": 200,
            r"Ph\.?D thesis": 200,
        }
        self.kws_masterthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "master thesis": 200,
            r"master\W?s thesis": 200,
        }
        self.types = {
            UNPUBLISHED: self.kws_unpublished,
            ARTICLE: self.kws_article,
            PHDTHESIS: self.kws_phdthesis,
            MASTERTHESIS: self.kws_masterthesis,
            TECHREPORT: self.kws_techreport,
        }
        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)

    def _compile_patterns(self):
        """
        Builds every pattern string and compiled regular expression of the
        class. Needs nothing but the ``re`` module.

        All regular expressions are raw strings so that sequences such as
        ``\\s`` or ``\\W`` reach the regex engine unchanged.
        """
        self.pat_month = (
            r"january|february|march|april|may|june|july|august|"
            r"september|october|november|december|jan\.?|feb\.?|"
            r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
            r"nov\.?|dec\.?")
        # The month alternation is wrapped in a non-capturing group. Without
        # it the blank and the optional day belonged only to the last
        # alternative ("dec\.?"), so e.g. "march 12 2004" could not match.
        # Group numbers are unchanged.
        self.pat_date = (r"((?:" + self.pat_month + r") ([_0-9]+\.? )*)"
                         r"(19[0-9]{2}|2[0123][0-9]{2})")

        # Keyword test used by _is_proceedings. It used to be stored in
        # ``re_proceedings`` and was overwritten by the prefix pattern
        # further down before it could be used.
        self.re_proceedings_keyword = \
            re.compile(r'\W(proceedings|conference)\W', re.DOTALL)
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile(r'Pages:\s*([0-9]+)')
        self.pat_chapters = "R(eferences|EFERENCES)"
        self.re_chapters = re.compile(
            r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

        # Proceedings articles patterns and RE
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
        self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        self.pat_roman_nums = (
            self.pat_pagesep
            + "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|"
              "V|IV|III|II|I)"
            + self.pat_pagesep)
        self.pat_page_end_strict = (
            "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
            + r"|\npage [0-9]+\n|\.[0-9]+" + self.pat_pagesep + ")")
        self.pat_page_end = (
            "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
            + r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)")
        prefix_head = \
            "(^.+((" + self.pat_time + "|" + self.pat_toc_page + ").*"
        prefix_tail = "|" + self.pat_roman_nums + "))"
        self.re_proceedings_prefix_strict = re.compile(
            prefix_head + self.pat_page_end_strict + prefix_tail,
            re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = re.compile(
            prefix_head + self.pat_page_end + prefix_tail,
            re.DOTALL | re.IGNORECASE)
        self.re_abstract = re.compile(
            r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
            re.DOTALL | re.IGNORECASE)
        # Everything up to the blank line after the first "proceedings";
        # used by get_articles.
        self.re_proceedings = re.compile(
            r"(^.+?\Wproceedings\W.*?\n\n)", re.DOTALL | re.IGNORECASE)
        self.re_multi_strict = re.compile(
            r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?'
            + self.pat_page_end_strict + ')', re.DOTALL)
        self.re_previous_1 = re.compile(
            r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
            r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
            r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?'
            + self.pat_page_end + ')', re.DOTALL)
        self.re_previous_2 = re.compile(
            r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
            r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
            r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])', re.DOTALL)

    def _find_events(self, text):
        """Returns True when the event dictionary finds a term in text."""
        events = self.rrsdict_events.text_search(text, False, RET_ORIG_TERM)
        return len(events) > 0

    def _find_events_2(self, text):
        """
        Returns True when text contains slash-separated runs of digits and
        capital letters (e.g. "AB/12").
        """
        return re.search("[0-9A-Z]+(/[0-9A-Z]+)+", text) is not None

    def _is_proceedings(self, text):
        """
        Returns True when text contains the word "proceedings" or
        "conference" delimited by non-word characters. The pattern is
        case-sensitive; get_document_type passes lower-cased text.
        """
        return self.re_proceedings_keyword.search(text) is not None

    def _is_presentation(self, pdfinfo):
        """Returns True when the pdfinfo output mentions "powerpoint"."""
        return self.re_ms_powerpoint.search(pdfinfo) is not None

    def _is_poster(self, pdfinfo):
        """Returns True when the pdfinfo output reports exactly one page."""
        pages = self.re_pages.search(pdfinfo)
        return pages is not None and int(pages.group(1)) == 1

    def _shrink_text(self, text):
        """
        Cuts text before the last "References"/"REFERENCES" heading; text
        without such a heading is returned unchanged.
        """
        chapters = self.re_chapters.search(text)
        if chapters:
            return chapters.group(1)
        return text

    def get_pdfinfo(self, pdf_file_path):
        """
        Returns pdf metadata using pdfinfo program.

        The program is started with an argument list, so the path is never
        interpreted by a shell. Standard output and standard error are
        returned together with one trailing newline removed, matching what
        commands.getoutput used to return.

        @param pdf_file_path: path of the PDF file
        @type pdf_file_path: str
        @return: output of pdfinfo; empty string when the program cannot be
                 started
        @rtype: str
        """
        import subprocess
        try:
            process = subprocess.Popen(['pdfinfo', pdf_file_path],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT)
        except OSError:
            # Best effort: without pdfinfo the document is typed from its
            # text only.
            return ""
        output = process.communicate()[0]
        if not isinstance(output, str):
            output = output.decode('utf-8', 'replace')
        if output[-1:] == '\n':
            output = output[:-1]
        return output

    def _score_types(self, text_orig):
        """
        Scores text_orig against every keyword table in self.types.

        String keywords are searched as whole words in the lower-cased text.
        The search ignores case so that keywords written with capitals
        ("Ph\\.?D thesis") can match the lower-cased text. Callable keywords
        are called with the text in its original case.

        @return: {type: [score, number of keywords found]}
        @rtype: dict
        """
        text = text_orig.lower()
        points = {}
        for doc_type, keywords in self.types.items():
            score = [0, 0]
            for keyword, weight in keywords.items():
                if isinstance(keyword, str):
                    found = re.search(r"(^|\W)" + keyword + r"(\W|$)", text,
                                      re.DOTALL | re.IGNORECASE)
                else:
                    found = keyword(text_orig)
                if found:
                    score[0] += int(weight)
                    score[1] += 1
            points[doc_type] = score
        return points

    def get_document_type(self, text_file_path, pdf_file_path=None):
        """
        Main method. Returns type of specified document.

        @param text_file_path: path of the plain-text version of the
                               document
        @type text_file_path: str
        @param pdf_file_path: optional path of the PDF; when given, pdfinfo
                              output is checked for presentations and
                              posters first
        @type pdf_file_path: str
        @return: one of the document type constants
        """
        if pdf_file_path is not None:
            pdfinfo = self.get_pdfinfo(pdf_file_path)
            if self._is_presentation(pdfinfo):
                return PRESENTATION
            elif self._is_poster(pdfinfo):
                return POSTER
        # Only the beginning of the document is examined.
        offset = 5000
        with open(text_file_path, 'r') as text_file:
            text_orig = text_file.read(offset)
        text_orig = self._shrink_text(text_orig)
        points = self._score_types(text_orig)
        final_type = (MISC, 0)
        for doc_type in points:
            score = points[doc_type][0]
            if score > final_type[1]:
                final_type = (doc_type, score)
        # An article that beats "unpublished" by 20 points or less is
        # reported as unpublished.
        if final_type[0] == ARTICLE and points[UNPUBLISHED][0] != 0:
            if final_type[1] - points[UNPUBLISHED][0] <= 20:
                final_type = (UNPUBLISHED, points[UNPUBLISHED][0])
        if final_type[0] == ARTICLE or final_type[0] == UNPUBLISHED:
            if self._is_proceedings(text_orig.lower()):
                final_type = (INPROCEEDINGS, final_type[1])
        return final_type[0]

    def get_articles(self, text):
        """
        In case of proceedings, this method returns list of contained
        articles.

        @param text: full text of the proceedings
        @type text: str
        @return: texts of the articles; empty when fewer than two are found
        @rtype: [str]
        """
        articles = []
        head = self.re_abstract.search(text)
        if head:
            reduced_text = head.group(1)
        else:
            reduced_text = text
        # Drop the front matter, trying the strictest prefix pattern first.
        for prefix_re in (self.re_proceedings_prefix_strict,
                          self.re_proceedings_prefix,
                          self.re_proceedings):
            prefix = prefix_re.search(reduced_text)
            if prefix:
                text = text.replace(prefix.group(1), "")
                break
        while self.re_multi_strict.search(text):
            article = self.re_multi_strict.search(text).group(1)
            text = text.replace(article, "")
            # Leading material matched by re_previous_1 is moved to the end
            # of the previous article.
            while self.re_previous_1.search(article):
                previous = self.re_previous_1.search(article).group(1)
                article = article.replace(previous, "")
                if len(articles) > 0:
                    articles[-1] += "\n" + previous
            if self.re_previous_2.search(article) and len(articles) > 0:
                articles[-1] += "\n\n" + article
            else:
                articles.append(article)
        if len(articles) == 1:
            articles = []
        return articles
def __init__(self):
    """
    Builds the keyword tables used for document typing, compiles the
    regular expressions used for proceedings splitting and loads the event
    acronym dictionary.

    All regular expressions are written as raw strings so that sequences
    such as ``\\s`` or ``\\W`` reach the regex engine unchanged.
    """
    self.pat_month = (
        r"january|february|march|april|may|june|july|august|"
        r"september|october|november|december|jan\.?|feb\.?|"
        r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        r"nov\.?|dec\.?")
    # The month alternation is wrapped in a non-capturing group. Without it
    # the blank and the optional day belonged only to the last alternative
    # ("dec\.?"), so e.g. "march 12 2004" could not match. Group numbers
    # are unchanged.
    self.pat_date = (r"((?:" + self.pat_month + r") ([_0-9]+\.? )*)"
                     r"(19[0-9]{2}|2[0123][0-9]{2})")

    # Keyword tables: key is a regex (or a callable taking the text),
    # value is the score added when it is found.
    self.kws_unpublished = {"introduction": 50, "abstract": 50,
                            "related work": 50}
    self.kws_article = {
        r"(vol\.?|volume)\s*[0-9]+": 20,
        self.pat_date: 20,
        r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
        r"(number|no\.?)\s*[_0-9]+": 20,
        "copyright": 20,
        "all rights reserved": 20,
        "journal": 20,
        "is published": 20,
        "published in": 20,
        "first published": 20,
        "in press": 20,
        "introduction": 50,
        "abstract": 50,
        "related work": 50,
        self._find_events: 50,
        self._find_events_2: 20,
    }
    self.kws_techreport = {"this report": 200, "tech[a-z]+ report": 200,
                           "summary report": 100, "is (a )?report": 80}
    # The thesis keyword is lower-cased: get_document_type searches string
    # keywords in the lower-cased text without re.IGNORECASE, so
    # "Ph\.?D thesis" could never match.
    self.kws_phdthesis = {"supervisor": 100, "this thesis": 200,
                          "dissertation": 200, r"ph\.?d thesis": 200}
    self.kws_masterthesis = {"supervisor": 100, "this thesis": 200,
                             "master thesis": 200,
                             r"master\W?s thesis": 200}
    self.types = {UNPUBLISHED: self.kws_unpublished,
                  ARTICLE: self.kws_article,
                  PHDTHESIS: self.kws_phdthesis,
                  MASTERTHESIS: self.kws_masterthesis,
                  TECHREPORT: self.kws_techreport}

    # NOTE(review): this pattern used to be assigned to ``re_proceedings``
    # and was overwritten further down, so it never took part in any match.
    # It is kept under its own name; presumably it was meant for
    # _is_proceedings - confirm before wiring it in.
    self.re_proceedings_keyword = \
        re.compile(r'\W(proceedings|conference)\W', re.DOTALL)
    self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
    self.re_oo_impress = re.compile('impress', re.IGNORECASE)
    self.re_pages = re.compile(r'Pages:\s*([0-9]+)')
    self.pat_chapters = "R(eferences|EFERENCES)"
    self.re_chapters = re.compile(
        r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

    # Proceedings articles patterns and RE
    self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
    self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
    self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
    self.pat_roman_nums = (
        self.pat_pagesep
        + "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|"
          "IV|III|II|I)"
        + self.pat_pagesep)
    self.pat_page_end_strict = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
        + r"|\npage [0-9]+\n|\.[0-9]+" + self.pat_pagesep + ")")
    self.pat_page_end = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep
        + r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)")
    prefix_head = "(^.+((" + self.pat_time + "|" + self.pat_toc_page + ").*"
    prefix_tail = "|" + self.pat_roman_nums + "))"
    self.re_proceedings_prefix_strict = re.compile(
        prefix_head + self.pat_page_end_strict + prefix_tail,
        re.DOTALL | re.IGNORECASE)
    self.re_proceedings_prefix = re.compile(
        prefix_head + self.pat_page_end + prefix_tail,
        re.DOTALL | re.IGNORECASE)
    self.re_abstract = re.compile(
        r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
        re.DOTALL | re.IGNORECASE)
    self.re_proceedings = re.compile(
        r"(^.+?\Wproceedings\W.*?\n\n)", re.DOTALL | re.IGNORECASE)
    self.re_multi_strict = re.compile(
        r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?'
        + self.pat_page_end_strict + ')', re.DOTALL)
    self.re_previous_1 = re.compile(
        r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
        r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
        r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?'
        + self.pat_page_end + ')', re.DOTALL)
    self.re_previous_2 = re.compile(
        r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
        r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
        r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])', re.DOTALL)
    self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)
class DocumentInfo(object):
    """
    This class collects information about PDF documents.

    It guesses the document type from the `pdfinfo` metadata and from
    weighted keywords found at the beginning of the extracted text, and it
    can split the text of a proceedings volume into individual articles.
    """

    def __init__(self):
        """
        Build the keyword tables and regular expressions used to classify
        a document and to split proceedings into articles.

        All patterns are written as raw strings; the regex engine
        interprets the backslash-n escape itself, so matching is unchanged.
        """
        # Month names, full and abbreviated. Lowercase only: keyword
        # patterns are searched in lowercased text.
        self.pat_month = (
            r"january|february|march|april|may|june|july|august|"
            r"september|october|november|december|jan\.?|feb\.?|"
            r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
            r"nov\.?|dec\.?"
        )
        # "<month> [<number>. ]* <year>", year in 1900-1999 or 2000-2399.
        self.pat_date = (
            "(" + self.pat_month +
            r" ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"
        )

        # Keyword tables: key is a regex fragment or a callable taking the
        # text, value is the number of points added when it hits.
        self.kws_unpublished = {
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
        }
        self.kws_article = {
            r"(vol\.?|volume)\s*[0-9]+": 20,
            self.pat_date: 20,
            r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
            r"(number|no\.?)\s*[_0-9]+": 20,
            "copyright": 20,
            "all rights reserved": 20,
            "journal": 20,
            "is published": 20,
            "published in": 20,
            "first published": 20,
            "in press": 20,
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
            self._find_events: 50,
            self._find_events_2: 20,
        }
        self.kws_techreport = {
            "this report": 200,
            "tech[a-z]+ report": 200,
            "summary report": 100,
            "is (a )?report": 80,
        }
        self.kws_phdthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "dissertation": 200,
            # Lowercase on purpose: the text is lowercased before keyword
            # matching, so the former "Ph\.?D thesis" could never match.
            r"ph\.?d thesis": 200,
        }
        self.kws_masterthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "master thesis": 200,
            r"master\W?s thesis": 200,
        }
        self.types = {
            UNPUBLISHED: self.kws_unpublished,
            ARTICLE: self.kws_article,
            PHDTHESIS: self.kws_phdthesis,
            MASTERTHESIS: self.kws_masterthesis,
            TECHREPORT: self.kws_techreport,
        }

        # Plain mention of "proceedings"/"conference", used by
        # _is_proceedings(). This pattern used to be stored in
        # self.re_proceedings and was overwritten further down, so
        # _is_proceedings() silently ran the prefix pattern instead.
        self.re_proceedings_mention = \
            re.compile(r'\W(proceedings|conference)\W', re.DOTALL)
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        # NOTE(review): compiled but not consulted by any method of this
        # class.
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile(r'Pages:\s*([0-9]+)')
        self.pat_chapters = "R(eferences|EFERENCES)"
        # Group 1 is everything up to the last "References" heading.
        self.re_chapters = re.compile(
            r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

        #Proceedings articles patterns and RE
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
        self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        self.pat_roman_nums = (
            self.pat_pagesep +
            "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I)" +
            self.pat_pagesep
        )
        self.pat_page_end_strict = (
            "(" + self.pat_pagesep + "[0-9]+" +
            self.pat_pagesep + r"|\npage [0-9]+\n|\.[0-9]+" +
            self.pat_pagesep + ")"
        )
        self.pat_page_end = (
            "(" + self.pat_pagesep + "[0-9]+" +
            self.pat_pagesep + r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        )
        self.re_proceedings_prefix_strict = re.compile(
            "(^.+((" + self.pat_time + "|" + self.pat_toc_page +
            ").*" + self.pat_page_end_strict + "|" +
            self.pat_roman_nums + "))", re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = re.compile(
            "(^.+((" + self.pat_time + "|" + self.pat_toc_page +
            ").*" + self.pat_page_end + "|" + self.pat_roman_nums +
            "))", re.DOTALL | re.IGNORECASE)
        self.re_abstract = re.compile(
            r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
            re.DOTALL | re.IGNORECASE)
        # Group 1 is the text up to the first blank line following the
        # word "proceedings"; get_articles() strips it as a last resort.
        self.re_proceedings = re.compile(
            r"(^.+?\Wproceedings\W.*?\n\n)", re.DOTALL | re.IGNORECASE)
        self.re_multi_strict = re.compile(
            r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' +
            self.pat_page_end_strict + ')', re.DOTALL)
        self.re_previous_1 = re.compile(
            r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
            r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
            r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' +
            self.pat_page_end + ')', re.DOTALL)
        self.re_previous_2 = re.compile(
            r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
            r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
            r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])', re.DOTALL)

        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)

    def _find_events(self, text):
        """
        Return True when the event dictionary search over `text` yields at
        least one result.
        """
        events = self.rrsdict_events.text_search(text, False, RET_ORIG_TERM)
        return len(events) > 0

    def _find_events_2(self, text):
        """
        Return True when `text` contains slash-separated tokens made of
        digits and uppercase letters (e.g. "ACM/IEEE").
        """
        return re.search("[0-9A-Z]+(/[0-9A-Z]+)+", text) is not None

    def _is_proceedings(self, text):
        """
        Return True when `text` mentions "proceedings" or "conference" as
        a separate word. The pattern is lowercase and case-sensitive; the
        caller passes lowercased text.
        """
        return self.re_proceedings_mention.search(text) is not None

    def _is_presentation(self, pdfinfo):
        """
        Return True when the pdfinfo output mentions "powerpoint"
        (case-insensitive).
        """
        return self.re_ms_powerpoint.search(pdfinfo) is not None

    def _is_poster(self, pdfinfo):
        """
        Return True when the pdfinfo output reports exactly one page
        ("Pages: 1"); False for any other count or when no count is found.
        """
        match = self.re_pages.search(pdfinfo)
        return match is not None and int(match.group(1)) == 1

    def _shrink_text(self, text):
        """
        Cut `text` just before its last "References"/"REFERENCES" heading.
        The text is returned unchanged when there is no such heading.
        """
        match = self.re_chapters.search(text)
        if match:
            return match.group(1)
        return text

    def _score_keywords(self, text, text_orig):
        """
        Score every known document type against the text.

        text      -- lowercased text, searched with the regex keywords
        text_orig -- the same text in original case, passed to callable
                     keywords

        Returns a dict mapping each type to [points, number of hits].
        """
        points = {}
        for doc_type, keywords in self.types.items():
            score = 0
            hits = 0
            for keyword, weight in keywords.items():
                if isinstance(keyword, str):
                    # IGNORECASE keeps a keyword written with capitals
                    # usable even though the text has been lowercased.
                    found = re.search(r"(^|\W)" + keyword + r"(\W|$)",
                                      text, re.DOTALL | re.IGNORECASE)
                else:
                    found = keyword(text_orig)
                if found:
                    score += int(weight)
                    hits += 1
            points[doc_type] = [score, hits]
        return points

    def get_pdfinfo(self, pdf_file_path):
        """
        Returns pdf metadata using pdfinfo program.

        The program is started without a shell, so paths containing spaces
        or shell metacharacters are passed through literally. Standard
        error is merged into the returned text and one trailing newline is
        removed. An empty string is returned when pdfinfo cannot be
        started.
        """
        import subprocess
        try:
            process = subprocess.Popen(['pdfinfo', pdf_file_path],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       universal_newlines=True)
        except OSError:
            # Best effort: classification then falls back to the text.
            return ''
        output = process.communicate()[0]
        if output.endswith('\n'):
            output = output[:-1]
        return output

    def get_document_type(self, text_file_path, pdf_file_path=None):
        """
        Main method. Returns type of specified document.

        text_file_path -- path to the plain text extracted from the
                          document
        pdf_file_path  -- optional path to the PDF; when given, its
                          metadata is checked first for a presentation or
                          a one-page poster

        Only the first 5000 characters of the text, cut before a
        "References" heading, are scored.
        """
        if pdf_file_path is not None:
            pdfinfo = self.get_pdfinfo(pdf_file_path)
            if self._is_presentation(pdfinfo):
                return PRESENTATION
            elif self._is_poster(pdfinfo):
                return POSTER

        offset = 5000
        with open(text_file_path, 'r') as text_file:
            text_orig = text_file.read(offset)
        text_orig = self._shrink_text(text_orig)
        text = text_orig.lower()

        points = self._score_keywords(text, text_orig)

        # The highest score wins; MISC when nothing scored above zero.
        final_type = (MISC, 0)
        for doc_type in points:
            score = points[doc_type][0]
            if score > final_type[1]:
                final_type = (doc_type, score)

        # An article leading "unpublished" by at most 20 points has no
        # convincing publication evidence.
        if final_type[0] == ARTICLE and points[UNPUBLISHED][0] != 0:
            if final_type[1] - points[UNPUBLISHED][0] <= 20:
                final_type = (UNPUBLISHED, points[UNPUBLISHED][0])
        if final_type[0] == ARTICLE or final_type[0] == UNPUBLISHED:
            if self._is_proceedings(text):
                final_type = (INPROCEEDINGS, final_type[1])
        return final_type[0]

    def get_articles(self, text):
        """
        In case of proceedings, this method returns list of contained
        articles.

        An empty list is returned when fewer than two articles are found.
        """
        articles = []
        match = self.re_abstract.search(text)
        reduced_text = match.group(1) if match else text

        # Strip the front matter using the first prefix pattern that
        # matches, strictest first.
        prefix_patterns = (self.re_proceedings_prefix_strict,
                           self.re_proceedings_prefix,
                           self.re_proceedings)
        for pattern in prefix_patterns:
            match = pattern.search(reduced_text)
            if match:
                # Literal removal; same result as substituting the
                # escaped text with an empty string.
                text = text.replace(match.group(1), "")
                break

        while True:
            match = self.re_multi_strict.search(text)
            if not match:
                break
            article = match.group(1)
            text = text.replace(article, "")

            # Leading material matched by re_previous_1 is moved to the
            # end of the previous article (dropped when there is none).
            while True:
                match = self.re_previous_1.search(article)
                if not match:
                    break
                previous = match.group(1)
                article = article.replace(previous, "")
                if articles:
                    articles[-1] += "\n" + previous

            # A chunk opening with a section/table/figure heading is a
            # continuation of the previous article.
            if self.re_previous_2.search(article) and articles:
                articles[-1] += "\n\n" + article
            else:
                articles.append(article)

        if len(articles) == 1:
            articles = []
        return articles