def __init__(self):
        """
        Build the pattern strings and compiled regular expressions used for
        metadata searching and load the location dictionaries.

        Attributes named pat_* / _pat_* are plain pattern strings, attributes
        named re_* / _re_* are compiled regular expressions.

        @raise DictionaryError: if RRSDictionary raises RRSDictionaryError
                                while the dictionaries are being loaded
        """
        #patterns
        # Abstract introduced by an explicit heading.
        self._pat_abstract_1 = '(Abstract|ABSTRACT)'
        # Abstract recognized by its typical opening words.
        self._pat_abstract_2 = '(In this|This (paper|study|article|report)'
        self._pat_abstract_2 += '|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))'
        # Abstract terminator: the '!A_E!' marker, end of text or a blank line.
        self._pat_abstract_end = '(!A_E!|$|\n\s*\n)'
        # Headings of a keyword section ("Index terms", "Keywords",
        # "General terms"); only the first letter must be upper case.
        self._pat_keywords = 'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
        self._pat_keywords += 'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
        self._pat_keywords += 'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?'
        # Month names, full or abbreviated, with an optional trailing dot.
        self.pat_months = 'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
        self.pat_months += 'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
        self.pat_months += 'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?'
        # Words that signal a line continues on the next one (see re_end).
        self.pat_junctions = 'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
        # Name particles that may appear between first name and surname.
        self.pat_middle_name = '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
        self.pat_middle_name += 'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
        self.pat_middle_name += 'Chen|Van de|Van den|el|El)'
        # Tokens accepted next to a name: "et al.", "Sons", "Jr", "Junior", "etc."
        self.alone_name = \
            '([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
        # Roman numerals I-X followed by a space or a dot.
        self.roman_num = '(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
        self.roman_num += 'VIII[ .]|IX[ .]|X[ .])'

        #regular expressions
        self._re_abstract_1 = re.compile(self._pat_abstract_1 + '\W+(.+?)'
                                       + self._pat_abstract_end, re.DOTALL)
        self._re_abstract_2 = re.compile(self._pat_abstract_2 + '\W+(.+?)'
                                       + self._pat_abstract_end, re.DOTALL)
        # Keyword heading on its own line, keywords on the following line.
        self._re_keywords_1 = re.compile('(' + self._pat_keywords
                                         + ')[-:,;. ]*?\n+(.+?)\n')
        # Keyword heading followed by the keywords on the same line.
        self._re_keywords_2 = re.compile('(' + self._pat_keywords
                                         + ')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
        self._re_meta = re.compile('(^.*)(authors?:|emails?:|editors?:)', re.I)
        self.re_published = re.compile('(^|\s)published (in|with)', re.I)
        self.re_issn = re.compile('ISSN|[0-9]{4}-[0-9]{4}')
        self.re_rep = re.compile('(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
        self.re_lower_start = re.compile('^[a-z][^A-Z-\'`]')
        self.re_vol = re.compile('vol(ume)?\.? *[0-9]+', re.I)
        self.re_rev = re.compile('[Rr]evision *[0-9]+')
        self.re_inproc = \
            re.compile('in proc\.|in proceedings|proceedings of', re.I)
        self.re_no = re.compile('n(o|umber)\.? *[0-9]+', re.I)
        self.re_pages = re.compile('(pages?|p\.) [0-9]', re.I)
        self.re_etal = re.compile('et al\.?(\s|$)', re.I)
        self.re_copyright = re.compile('(^|\s)copyright(\s|$)', re.I)
        self.re_date = re.compile('(' + self.pat_months + ') .*?[0-9]{4}', re.I)
        self.re_year = re.compile('\([0-9]{4}\)')
        self.re_num = re.compile('^[0-9\s%]+$', re.DOTALL)
        self.re_noletter = re.compile('^[^a-z]+$', re.IGNORECASE)
        self.re_upper_word = re.compile('[A-Z][A-Z]+')
        self.re_lower_word = re.compile('[a-z]')
        self.re_telfax = re.compile('(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
        self.re_empty = re.compile('^\s*$', re.DOTALL)
        self.re_notitle = re.compile('\s(conference|symposium)\s', re.M | re.I)
        # NOTE(review): this value of re_title is overwritten by the second
        # assignment to re_title further below, so this pattern is never used.
        self.re_title = re.compile('[A-Z][A-Z]+( [A-Z][A-Z])* [0-9]{4}')
        self.re_mark = \
            re.compile('(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
        # NOTE(review): '|' binds weaker than the groups, so this reads as
        # "(^|\s)thesis" OR "article" OR "journal(\s|$)" - confirm intended.
        self.re_type = \
            re.compile('(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
        self.re_organization = \
            re.compile('department of|university|school of', re.IGNORECASE)
        self.re_press = \
            re.compile('(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
        self.re_domain = re.compile('.+\.[a-z]{2,3}$', re.IGNORECASE)
        # Whole line enclosed in square or round brackets.
        self.re_zav = re.compile('^[[(].*[])]$')
        # Explicit "Title:" label; group 1 is the text after the label,
        # group 2 the terminator (a dot or the end of the line).
        self.re_title = re.compile('^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')
        # One author name at the start of a line, in three alternative layouts.
        # NOTE(review): with re.VERBOSE the regex engine ignores whitespace
        # outside character classes, so the spaces inside pat_middle_name
        # alternatives such as 'van der' and in '(?<=\.) ' are not matched
        # literally - confirm intended.
        self.re_authors = re.compile('^[ ,.&]*('
                    '(([A-Z][-A-Za-z\'´`]+[.,]?|' + self.pat_middle_name + 
                    '[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
                    '(' #First-name von Surname
                    '(([A-Z][-A-Za-z\'´`]+)[.,]?[ ]+)'
                    '(([A-Z][-A-Za-z\'´`]+[.,]?|' + self.pat_middle_name + 
                    '[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
                    '(([A-Z][-A-Za-z\'´`]+)[ ]*)'
                    '|' #Surname, J.
                    '(([A-Z][-A-Za-z\'´`]+[.,]?)[ ]+)'
                    '(([A-Z][-A-Za-z\'´`]+[.,]?|' + self.pat_middle_name + 
                    '[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
                    '((' + self.pat_middle_name
                    + '[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?'
                    '|' #J. von Surname
                    '((' + self.pat_middle_name
                    + '[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)+?'
                    '(([A-Z][-A-Za-z\'´`]+[.,]?|' + self.pat_middle_name + 
                    '[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
                    '(([A-Z][-A-Za-z\'´`]+)[ ]*)'
                    ')'
                    '(([A-Z]v+|' + self.pat_middle_name
                    + '|([A-Z]\.?-)?[A-Z]\.)[ ]*)*?'
                    '(([., ]*(' + self.alone_name + '|'
                    + self.roman_num + ')?)*'
                    '((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?'
                    '([., ]*' + self.alone_name + ')*)', re.VERBOSE)
        # An initial, name particle, alone_name token or '&' delimited by
        # non-letters.
        self.re_inc = re.compile('[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name
                                 + '|' + self.alone_name + '|&)[^a-zA-Z]')
        # Line ending with '-', ':', ',' or a junction word.
        self.re_end = re.compile('([-:,]| (' + self.pat_junctions + '))\s*\n',
                            re.IGNORECASE)
        # Text starting with two or more lines that contain no lower-case
        # letters.
        self.re_split = \
            re.compile('^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
        # Three consecutive upper-case words.
        self.re_upper = re.compile('([A-Z][A-Z]+ ){3}')
        self.re_lower = re.compile('(^.+?) (.[^ ]?[a-z].*$)')
        self.re_by = re.compile('(^.+) by (.+?)$', re.DOTALL)
        self.re_autinline = \
            re.compile('^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
        self.re_word_end = re.compile(' [^A-Z]+\s*$')
        # One term of a ':', ',' or ';' separated list (group 2).
        self.re_term = re.compile('(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')

        #Dictionaries
        # Countries and cities are merged into one location dictionary.
        try:
            self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
            self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
        except RRSDictionaryError:
            raise DictionaryError("Failed to load dictionaries.")
class MetaExtractor(object):
    """
    This class contains functions for basic metadata searching in articles.

    It looks for the title, the abstract and the keywords in the header text
    of a document using regular expressions and a dictionary of locations.
    """
    def __init__(self):
        """
        Compile all regular expressions and load the location dictionaries.

        @raise DictionaryError: if the dictionaries cannot be loaded
        """
        self._compile_patterns()
        self._load_dictionaries()


    def _compile_patterns(self):
        """
        Build the pattern strings (pat_*, _pat_*) and compile the regular
        expressions (re_*, _re_*) used by the searching methods.
        """
        #patterns
        # Abstract introduced by an explicit heading.
        self._pat_abstract_1 = '(Abstract|ABSTRACT)'
        # Abstract recognized by its typical opening words.
        self._pat_abstract_2 = '(In this|This (paper|study|article|report)'
        self._pat_abstract_2 += '|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))'
        # Abstract terminator: the '!A_E!' marker, end of text or a blank line.
        self._pat_abstract_end = r'(!A_E!|$|\n\s*\n)'
        # Headings of a keyword section; only the first letter must be upper.
        self._pat_keywords = 'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
        self._pat_keywords += 'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
        self._pat_keywords += 'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?'
        # Month names, full or abbreviated, with an optional trailing dot.
        self.pat_months = r'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
        self.pat_months += r'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
        self.pat_months += r'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?'
        # Words that signal a line continues on the next one (see re_end).
        self.pat_junctions = 'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
        # Name particles that may appear between first name and surname.
        self.pat_middle_name = '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
        self.pat_middle_name += 'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
        self.pat_middle_name += 'Chen|Van de|Van den|el|El)'
        # Tokens accepted next to a name: "et al.", "Sons", "Jr", "Junior", "etc."
        self.alone_name = \
            r'([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
        # Roman numerals I-X followed by a space or a dot.
        self.roman_num = '(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
        self.roman_num += 'VIII[ .]|IX[ .]|X[ .])'

        #regular expressions
        self._re_abstract_1 = re.compile(self._pat_abstract_1 + r'\W+(.+?)'
                                         + self._pat_abstract_end, re.DOTALL)
        self._re_abstract_2 = re.compile(self._pat_abstract_2 + r'\W+(.+?)'
                                         + self._pat_abstract_end, re.DOTALL)
        # Keyword heading on its own line, keywords on the following line.
        self._re_keywords_1 = re.compile('(' + self._pat_keywords
                                         + r')[-:,;. ]*?\n+(.+?)\n')
        # Keyword heading followed by the keywords on the same line.
        self._re_keywords_2 = re.compile('(' + self._pat_keywords
                                         + r')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
        self._re_meta = re.compile('(^.*)(authors?:|emails?:|editors?:)', re.I)
        self.re_published = re.compile(r'(^|\s)published (in|with)', re.I)
        self.re_issn = re.compile('ISSN|[0-9]{4}-[0-9]{4}')
        self.re_rep = re.compile(r'(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
        self.re_lower_start = re.compile(r"^[a-z][^A-Z-'`]")
        self.re_vol = re.compile(r'vol(ume)?\.? *[0-9]+', re.I)
        self.re_rev = re.compile('[Rr]evision *[0-9]+')
        self.re_inproc = \
            re.compile(r'in proc\.|in proceedings|proceedings of', re.I)
        self.re_no = re.compile(r'n(o|umber)\.? *[0-9]+', re.I)
        self.re_pages = re.compile(r'(pages?|p\.) [0-9]', re.I)
        self.re_etal = re.compile(r'et al\.?(\s|$)', re.I)
        self.re_copyright = re.compile(r'(^|\s)copyright(\s|$)', re.I)
        self.re_date = re.compile('(' + self.pat_months + ') .*?[0-9]{4}', re.I)
        self.re_year = re.compile(r'\([0-9]{4}\)')
        self.re_num = re.compile(r'^[0-9\s%]+$', re.DOTALL)
        self.re_noletter = re.compile('^[^a-z]+$', re.IGNORECASE)
        self.re_upper_word = re.compile('[A-Z][A-Z]+')
        self.re_lower_word = re.compile('[a-z]')
        self.re_telfax = re.compile('(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
        self.re_empty = re.compile(r'^\s*$', re.DOTALL)
        self.re_notitle = re.compile(r'\s(conference|symposium)\s', re.M | re.I)
        self.re_mark = \
            re.compile(r'(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
        # NOTE(review): '|' binds weaker than the groups, so this reads as
        # "(^|\s)thesis" OR "article" OR "journal(\s|$)" - confirm intended.
        self.re_type = \
            re.compile(r'(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
        self.re_organization = \
            re.compile('department of|university|school of', re.IGNORECASE)
        self.re_press = \
            re.compile('(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
        self.re_domain = re.compile(r'.+\.[a-z]{2,3}$', re.IGNORECASE)
        # Whole line enclosed in brackets; the brackets inside the sets are
        # escaped to avoid the "possible nested set" FutureWarning.
        self.re_zav = re.compile(r'^[\[(].*[\])]$')
        # Explicit "Title:" label; group 1 is the text after the label,
        # group 2 the terminator (a dot or the end of the line).
        # NOTE(review): re_title used to be assigned twice; the first
        # pattern (upper-case words followed by a four-digit number) was
        # always overwritten by this one and has been dropped as dead code.
        self.re_title = re.compile(r'^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')
        # One author name at the start of a line, in three alternative layouts.
        # NOTE(review): with re.VERBOSE the regex engine ignores whitespace
        # outside character classes, so the spaces inside pat_middle_name
        # alternatives such as 'van der' and in '(?<=\.) ' are not matched
        # literally - confirm intended.
        self.re_authors = re.compile(
            '^[ ,.&]*('
            r"(([A-Z][-A-Za-z'´`]+[.,]?|" + self.pat_middle_name +
            r'[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
            '('  # First-name von Surname
            r"(([A-Z][-A-Za-z'´`]+)[.,]?[ ]+)"
            r"(([A-Z][-A-Za-z'´`]+[.,]?|" + self.pat_middle_name +
            r'[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
            r"(([A-Z][-A-Za-z'´`]+)[ ]*)"
            '|'  # Surname, J.
            r"(([A-Z][-A-Za-z'´`]+[.,]?)[ ]+)"
            r"(([A-Z][-A-Za-z'´`]+[.,]?|" + self.pat_middle_name +
            r'[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
            '((' + self.pat_middle_name
            + r'[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?'
            '|'  # J. von Surname
            '((' + self.pat_middle_name
            + r'[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)+?'
            r"(([A-Z][-A-Za-z'´`]+[.,]?|" + self.pat_middle_name +
            r'[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?'
            r"(([A-Z][-A-Za-z'´`]+)[ ]*)"
            ')'
            '(([A-Z]v+|' + self.pat_middle_name
            + r'|([A-Z]\.?-)?[A-Z]\.)[ ]*)*?'
            '(([., ]*(' + self.alone_name + '|'
            + self.roman_num + ')?)*'
            r'((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?'
            '([., ]*' + self.alone_name + ')*)', re.VERBOSE)
        # An initial, name particle, alone_name token or '&' delimited by
        # non-letters.
        self.re_inc = re.compile(r'[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name
                                 + '|' + self.alone_name + '|&)[^a-zA-Z]')
        # Line ending with '-', ':', ',' or a junction word.
        self.re_end = re.compile('([-:,]| (' + self.pat_junctions + r'))\s*\n',
                                 re.IGNORECASE)
        # Text starting with two or more lines without lower-case letters.
        self.re_split = \
            re.compile(r'^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
        # Three consecutive upper-case words.
        self.re_upper = re.compile('([A-Z][A-Z]+ ){3}')
        self.re_lower = re.compile('(^.+?) (.[^ ]?[a-z].*$)')
        self.re_by = re.compile('(^.+) by (.+?)$', re.DOTALL)
        # Group 1 is the leading part, group 3 the part starting with a
        # capitalized word; group 2 is only the inner repeated group.
        self.re_autinline = \
            re.compile('^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
        self.re_word_end = re.compile(r' [^A-Z]+\s*$')
        # One term of a ':', ',' or ';' separated list (group 2).
        self.re_term = re.compile(r'(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')
        # Trailing whitespace and backslashes of a rebuilt line.
        self._re_line_tail = re.compile(r'(\s|\n|\\)+$')


    def _load_dictionaries(self):
        """
        Load countries and cities into one location dictionary.

        @raise DictionaryError: if RRSDictionary raises RRSDictionaryError
        """
        #Dictionaries
        try:
            self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
            self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
        except RRSDictionaryError:
            raise DictionaryError("Failed to load dictionaries.")


    def _is_title_impossible(self, line, check_lower=False):
        """
        Decide whether a line can be ruled out as the article title.

        @param line: one line of the header text
        @type line: str
        @param check_lower: if True, a mixed-case line whose first upper-case
                            word is a known location is ruled out as well
        @type check_lower: bool
        @return: True if the line cannot be the title
        @rtype: bool
        """
        if len(line) > 300:
            return True

        # A hit of any of these expressions rules the line out.
        excluding = (
            self.re_empty, self.re_notitle, self.re_title, self.re_rep,
            self.re_issn, self._re_meta, self.re_num, self.re_lower_start,
            self.re_telfax, self.re_vol, self.re_no, self.re_date,
            self.re_year, self.re_pages, self.re_inproc, self.re_type,
            self.re_etal, self.re_copyright, self.re_published, self.re_mark,
            self.re_organization, self.re_press, self.re_domain, self.re_zav,
            self.re_noletter, self.re_rev,
        )
        if any(regex.search(line) for regex in excluding):
            return True

        if check_lower and self.re_lower_word.search(line):
            upper_word = self.re_upper_word.search(line)
            if upper_word and \
            self.rrsdict_locations.contains_key(upper_word.group(0)):
                return True
        return False


    def _split_in_text(self, meta_text, line, head, tail):
        """
        Replace every occurrence of line in meta_text by head and tail put on
        two separate lines, with trailing whitespace and backslashes removed.

        Plain string replacement is used on purpose: the text comes from the
        document and may contain backslashes, which re.sub would interpret
        as escapes of a replacement template.
        """
        new_line = self._re_line_tail.sub("", head + "\n" + tail)
        return meta_text.replace(line, new_line)


    def find_title(self, meta_text):
        """
        This method finds article title in text.

        @param meta_text: part of the text containing metadata
        @type meta_text: str
        @return: returns tuple - title (None if no title was found) and the
                 unchanged input text
        @rtype: (str, str)
        """
        title = ""
        meta_text_orig = meta_text

        # Join lines ending with '-', ':', ',' or a junction word with the
        # following line.
        while self.re_end.search(meta_text):
            meta_text = self.re_end.sub(r"\g<1> ", meta_text)

        # Repair a title written in capitals that is split over more lines.
        split_title = self.re_split.search(meta_text)
        if split_title:
            orig_txt = split_title.group(0)
            new_txt = re.sub("  +", " ", orig_txt.replace("\n", " ")) + "\n"
            meta_text = meta_text.replace(orig_txt, new_txt)

        # Go through the header lines; when one line seems to hold more
        # pieces of metadata, split it.
        for line in meta_text.splitlines():
            if self._is_title_impossible(line, True):
                continue
            if self.re_upper.search(line) and self.re_lower.search(line):
                while self.re_upper.search(line):
                    parts = self.re_lower.search(line)
                    if parts is None:
                        break
                    meta_text = self._split_in_text(
                        meta_text, line, parts.group(1), parts.group(2))
                    line = parts.group(2)
            elif self.re_autinline.search(line) and \
            not self.re_word_end.search(line):
                while not self.re_word_end.search(line):
                    parts = self.re_autinline.search(line)
                    if parts is None:
                        break
                    # Group 3 is the trailing part; group 2 is just the last
                    # repetition of the inner group and lies inside group 1.
                    meta_text = self._split_in_text(
                        meta_text, line, parts.group(1), parts.group(3))
                    line = parts.group(3)
            elif self.re_by.search(line):
                by_parts = self.re_by.search(line)
                try:
                    change = bool(
                        self.re_inc.search(by_parts.group(2)) and
                        self.re_authors.search(by_parts.group(2)))
                except MemoryError:
                    # The author expression is very expensive; when it runs
                    # out of memory the line is split anyway.
                    change = True
                if change:
                    meta_text = self._split_in_text(
                        meta_text, line, by_parts.group(1), by_parts.group(2))

        # Go through the (possibly re-split) lines again and collect those
        # that could be the title.
        possible_titles = []
        for line in meta_text.splitlines():
            # A line with the "Title:" label is taken as the title.
            labelled = self.re_title.search(line)
            if labelled:
                title = labelled.group(1)
                trailing_meta = self._re_meta.search(title)
                if trailing_meta:
                    title = trailing_meta.group(1)
                break
            try:
                if self.re_inc.search(line) and self.re_authors.search(line):
                    continue
            except MemoryError:
                continue
            if self._is_title_impossible(line):
                continue

            # The line met all requirements.
            possible_titles.append(line)

        if title == "":
            # Take the first candidate with more than one word; if there is
            # none, the last single-word candidate (or None) remains.
            title = None
            for candidate in possible_titles:
                title = candidate
                if len(candidate.split(" ")) > 1:
                    break

        return (title, meta_text_orig)


    def find_abstract(self, meta_text):
        """
        This method finds article abstract in text.

        @param meta_text: part of the text containing metadata
        @type meta_text: str
        @return: returns tuple - abstract (None if not found) and the rest of
                 the text
        @rtype: (str, str)
        """
        abstract = None
        found = self._re_abstract_1.search(meta_text)
        if found:
            abstract = found.group(2)
        else:
            found = self._re_abstract_2.search(meta_text)
            if found:
                # The opening words belong to the abstract, the terminator
                # (the last group of the pattern) does not.
                end_group = found.re.groups
                abstract = \
                    found.group(0)[:found.start(end_group) - found.start(0)]
        if found:
            meta_text = meta_text.replace(found.group(0), "")

        if abstract is not None:
            if re.search('[a-z]', abstract, re.I) is None:
                abstract = None
            else:
                abstract = abstract.replace("\n", " ")

        return (abstract, meta_text)

    def find_keywords(self, meta_txt):
        """
        This method finds article keywords in text.

        @param meta_txt: part of the text containing metadata
        @type meta_txt: str
        @return: returns tuple - keywords and the rest of the text
        @rtype: ([RRSKeyword], str)
        """
        keywords = []
        keywords_text = ""

        # Cut every keyword section out of the text; the keywords of the
        # section found last are kept.
        while True:
            found = self._re_keywords_1.search(meta_txt)
            if found:
                keywords_text = found.group(2)
            else:
                found = self._re_keywords_2.search(meta_txt)
                if not found:
                    break
                keywords_text = found.group(2)
                if found.group(3) != "\n":
                    keywords_text += found.group(3).replace(".", "")
            meta_txt = meta_txt.replace(found.group(0), "")

        while re.search(self._pat_keywords, keywords_text):
            keywords_text = re.sub(self._pat_keywords, ", ", keywords_text)
        if keywords_text == "":
            return (keywords, meta_txt)

        c = 1
        is_upper = False
        while True:
            term = self.re_term.search(keywords_text)
            if not term:
                break
            if re.search('[A-Z]', term.group(2)):
                if not is_upper and c > 3:
                    # First term with a capital letter after more than three
                    # terms without one: the remaining text is handed back,
                    # prefixed with the '!A_E!' marker.
                    meta_txt += " !A_E! " + keywords_text
                    break
                is_upper = True

            keywords.append(RRSKeyword(title=term.group(2)))
            keywords_text = keywords_text.replace(term.group(0), "")
            c += 1

        return (keywords, meta_txt)
Esempio n. 3
0
    def __init__(self):
        """
        Set up the keyword tables, pattern strings and compiled regular
        expressions of this object and load the dictionary of event acronyms.

        Each kws_* table maps a key to a number; the keys are regular
        expression strings, except two keys of kws_article that are bound
        methods (self._find_events, self._find_events_2).
        NOTE(review): the numbers are presumably scores added when the key
        matches - confirm against the code that reads self.types.
        """
        # Month names, full or abbreviated with an optional trailing dot.
        self.pat_month = "january|february|march|april|may|june|july|august|"
        self.pat_month += "september|october|november|december|jan\.?|feb\.?|"
        self.pat_month += "mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        self.pat_month += "nov\.?|dec\.?"
        # Month, optional numeric tokens and a year 1900-1999 or 2000-2399.
        self.pat_date = "(" + self.pat_month + \
                        " ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"
        self.kws_unpublished = {
            "introduction": 50,
            "abstract": 50,
            "related work": 50
        }
        self.kws_article = {
            "(vol\.?|volume)\s*[0-9]+": 20,
            self.pat_date: 20,
            "(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
            "(number|no\.?)\s*[_0-9]+": 20,
            "copyright": 20,
            "all rights reserved": 20,
            "journal": 20,
            "is published": 20,
            "published in": 20,
            "first published": 20,
            "in press": 20,
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
            self._find_events: 50,
            self._find_events_2: 20
        }
        self.kws_techreport = {
            "this report": 200,
            "tech[a-z]+ report": 200,
            "summary report": 100,
            "is (a )?report": 80
        }
        self.kws_phdthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "dissertation": 200,
            "Ph\.?D thesis": 200
        }
        self.kws_masterthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "master thesis": 200,
            "master\W?s thesis": 200
        }
        # Keyword table for every document type.
        self.types = {
            UNPUBLISHED: self.kws_unpublished,
            ARTICLE: self.kws_article,
            PHDTHESIS: self.kws_phdthesis,
            MASTERTHESIS: self.kws_masterthesis,
            TECHREPORT: self.kws_techreport
        }

        # NOTE(review): re_proceedings is assigned again further below, so
        # this first pattern is overwritten before __init__ returns.
        self.re_proceedings = re.compile('\W(proceedings|conference)\W',
                                         re.DOTALL)
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile('Pages:\s*([0-9]+)')

        self.pat_chapters = "R(eferences|EFERENCES)"
        # Group 1 is everything up to the last "References" heading
        # (the leading '.*' is greedy and re.DOTALL lets it span lines).
        self.re_chapters = re.compile('(^.*\W)(' + self.pat_chapters + ')\W',
                                      re.DOTALL)

        #Proceedings articles patterns and RE
        # Time of day such as "9:30" or "14:05".
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        # Dot leaders followed by a number at the end of a line.
        self.pat_toc_page = "(\.\s*){2,}[0-9]+\n"
        # Blank line, optionally with a single dash next to it.
        self.pat_pagesep = "(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        # Roman numeral I-XX standing alone between two separators.
        self.pat_roman_nums = self.pat_pagesep + \
             "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I)" + \
            self.pat_pagesep
        # Number between separators, a "page N" line or ".N" + separator.
        self.pat_page_end_strict = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+" + \
                self.pat_pagesep + ")"
        # Looser variant: a bare blank line is accepted as well.
        self.pat_page_end = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        self.re_proceedings_prefix_strict = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end_strict + "|" + \
                       self.pat_roman_nums + "))", re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end + "|" + self.pat_roman_nums + \
                       "))", re.DOTALL | re.IGNORECASE)
        # Group 1 is the text before the first line that starts with
        # "abstract" or "references" (case-insensitive).
        self.re_abstract = \
            re.compile("(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
                       re.DOTALL | re.IGNORECASE)
        # Replaces the re_proceedings value assigned earlier in this method.
        self.re_proceedings = re.compile("(^.+?\Wproceedings\W.*?\n\n)",
                                         re.DOTALL | re.IGNORECASE)

        # At least 5000 characters, a "References" line and the text up to
        # the next strict page end.
        self.re_multi_strict = \
            re.compile('(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' + \
                       self.pat_page_end_strict + ')', re.DOTALL)
        self.re_previous_1 = \
            re.compile('(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|' + \
                       '[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|' + \
                       '[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' + \
                       self.pat_page_end + ')', re.DOTALL)
        # Text starting with a section-like heading or a numbered
        # table/figure/section label.
        self.re_previous_2 = \
            re.compile('^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|' + \
                       'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|' + \
                       'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
                       re.DOTALL)

        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)
Esempio n. 4
0
class DocumentInfo(object):
    """
    This class collects information about PDF documents.
    """
    def __init__(self):
        """
        Prepares the keyword tables and regular expressions used for
        document type detection and for splitting proceedings into articles.
        """
        # Lower-case month names and their usual abbreviations.
        self.pat_month = "january|february|march|april|may|june|july|august|"
        self.pat_month += "september|october|november|december|jan\.?|feb\.?|"
        self.pat_month += "mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        self.pat_month += "nov\.?|dec\.?"
        # Month, optional numeric parts (e.g. a day) and a year 1900-2399.
        # The month alternation is wrapped in its own non-capturing group:
        # without it the trailing " ..." part belonged only to the last
        # alternative (dec), so e.g. "march 2005" could never match.
        # Non-capturing keeps the numbering of the other groups unchanged.
        self.pat_date = "((?:" + self.pat_month + \
                        ") ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"

        # Keyword tables: a regular expression (or a predicate taking the
        # text) mapped to the weight get_document_type adds to the score of
        # the document type when the keyword is found.
        self.kws_unpublished = {
            "introduction": 50,
            "abstract": 50,
            "related work": 50
        }
        self.kws_article = {
            "(vol\.?|volume)\s*[0-9]+": 20,
            self.pat_date: 20,
            "(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
            "(number|no\.?)\s*[_0-9]+": 20,
            "copyright": 20,
            "all rights reserved": 20,
            "journal": 20,
            "is published": 20,
            "published in": 20,
            "first published": 20,
            "in press": 20,
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
            self._find_events: 50,
            self._find_events_2: 20
        }
        self.kws_techreport = {
            "this report": 200,
            "tech[a-z]+ report": 200,
            "summary report": 100,
            "is (a )?report": 80
        }
        # NOTE(review): "Ph\.?D thesis" contains capital letters, so it only
        # matches when the search is case-insensitive or the text keeps its
        # original case.
        self.kws_phdthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "dissertation": 200,
            "Ph\.?D thesis": 200
        }
        self.kws_masterthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "master thesis": 200,
            "master\W?s thesis": 200
        }
        # Document type constant -> keyword table scored for that type.
        self.types = {
            UNPUBLISHED: self.kws_unpublished,
            ARTICLE: self.kws_article,
            PHDTHESIS: self.kws_phdthesis,
            MASTERTHESIS: self.kws_masterthesis,
            TECHREPORT: self.kws_techreport
        }

        # NOTE(review): self.re_proceedings is assigned again further down
        # in this method, so this keyword pattern does not survive __init__.
        self.re_proceedings = re.compile('\W(proceedings|conference)\W',
                                         re.DOTALL)
        # Patterns applied to the output of the pdfinfo program.
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        # NOTE(review): re_oo_impress is not referenced by the methods
        # visible in this class - confirm whether it is still needed.
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile('Pages:\s*([0-9]+)')

        # Group 1 of re_chapters is everything before the last
        # "References"/"REFERENCES" heading (greedy match).
        self.pat_chapters = "R(eferences|EFERENCES)"
        self.re_chapters = re.compile('(^.*\W)(' + self.pat_chapters + ')\W',
                                      re.DOTALL)

        #Proceedings articles patterns and RE
        # Clock time such as "9:30" or "14:05".
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        # Two or more dots (possibly spaced) followed by a number that ends
        # the line.
        self.pat_toc_page = "(\.\s*){2,}[0-9]+\n"
        # Blank line, optionally with a lone dash before or after it.
        self.pat_pagesep = "(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        # Roman numeral I-XX standing between two page separators.
        self.pat_roman_nums = self.pat_pagesep + \
             "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I)" + \
            self.pat_pagesep
        # Page end: a number between page separators, a "page N" line, or
        # ".N" followed by a page separator.
        self.pat_page_end_strict = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+" + \
                self.pat_pagesep + ")"
        # Looser variant that also accepts a bare blank line.
        self.pat_page_end = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        # Greedy match from the start of the text up to the last time or
        # dotted page number followed by a page end, or up to a roman
        # numeral; group 1 is the part get_articles strips.
        self.re_proceedings_prefix_strict = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end_strict + "|" + \
                       self.pat_roman_nums + "))", re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end + "|" + self.pat_roman_nums + \
                       "))", re.DOTALL | re.IGNORECASE)
        # Group 1 is the text before the first line starting with
        # "abstract" or "references" (any case).
        self.re_abstract = \
            re.compile("(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
                       re.DOTALL | re.IGNORECASE)
        # Replaces the keyword pattern assigned above: group 1 runs from
        # the start of the text to the first blank line after "proceedings".
        self.re_proceedings = re.compile("(^.+?\Wproceedings\W.*?\n\n)",
                                         re.DOTALL | re.IGNORECASE)

        # One article: at least 5000 characters, a "References" line (any
        # case) and then the shortest run up to a strict page end.
        self.re_multi_strict = \
            re.compile('(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' + \
                       self.pat_page_end_strict + ')', re.DOTALL)
        # Leading fragment of a candidate article that get_articles hands
        # back to the previous article.
        self.re_previous_1 = \
            re.compile('(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|' + \
                       '[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|' + \
                       '[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' + \
                       self.pat_page_end + ')', re.DOTALL)
        # Candidate starting with a section-like heading; get_articles
        # appends such a candidate to the previous article.
        self.re_previous_2 = \
            re.compile('^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|' + \
                       'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|' + \
                       'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
                       re.DOTALL)

        # Dictionary built from EVENT_ACRONYMS, searched by _find_events.
        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)

    def _find_events(self, text):
        """
        Returns True when searching text with self.rrsdict_events yields at
        least one hit, False otherwise.
        """
        hits = self.rrsdict_events.text_search(text, False, RET_ORIG_TERM)
        return len(hits) > 0

    def _find_events_2(self, text):
        if re.search("[0-9A-Z]+(/[0-9A-Z]+)+", text):
            return True
        else:
            return False

    def _is_proceedings(self, text):
        if self.re_proceedings.search(text):
            return True
        else:
            return False

    def _is_presentation(self, pdfinfo):
        if self.re_ms_powerpoint.search(pdfinfo):
            return True
        else:
            return False

    def _is_poster(self, pdfinfo):
        if self.re_pages.search(pdfinfo):
            pnum = int(self.re_pages.search(pdfinfo).group(1))
            if pnum == 1:
                return True
            else:
                return False
        else:
            return False

    def _shrink_text(self, text):
        if self.re_chapters.search(text):
            text = self.re_chapters.search(text).group(1)
        return text

    def get_pdfinfo(self, pdf_file_path):
        """
        Returns pdf metadata using pdfinfo program.
        """
        return commands.getoutput('pdfinfo ' + pdf_file_path)

    def get_document_type(self, text_file_path, pdf_file_path=None):
        """
        Main method.
        Returns type of specified document.

        When pdf_file_path is given, the pdfinfo metadata is checked first
        and PRESENTATION or POSTER is returned straight away if it matches.
        Otherwise the first 5000 characters of the text file (cut before a
        "References" heading, if any) are scored against the keyword tables
        in self.types and the best scoring type is returned. MISC is
        returned when nothing scores.

        text_file_path -- path of the text file to classify
        pdf_file_path -- optional path of the PDF passed to pdfinfo
        """
        if pdf_file_path is not None:
            pdfinfo = self.get_pdfinfo(pdf_file_path)
            if self._is_presentation(pdfinfo):
                return PRESENTATION
            elif self._is_poster(pdfinfo):
                return POSTER

        # Only the head of the document is examined; the file is closed as
        # soon as it has been read.
        offset = 5000
        with open(text_file_path, 'r') as text_file:
            text_orig = text_file.read(offset)
        text_orig = self._shrink_text(text_orig)
        text = text_orig.lower()

        # Sum the weights of all keywords found, per document type.
        points = {}
        for doc_type, keywords in self.types.items():
            score = 0
            for kw, weight in keywords.items():
                if isinstance(kw, str):
                    # The text is lower-cased, so keywords written with
                    # capitals (e.g. the "Ph.D thesis" one) need IGNORECASE
                    # to be able to match at all.
                    found = re.search(r"(^|\W)" + kw + r"(\W|$)", text,
                                      re.DOTALL | re.IGNORECASE)
                else:
                    # Predicates get the text in its original case.
                    found = kw(text_orig)
                if found:
                    score += int(weight)
            points[doc_type] = score

        # Strictly greater: on a tie the type met first keeps the lead.
        final_type = (MISC, 0)
        for doc_type, score in points.items():
            if score > final_type[1]:
                final_type = (doc_type, score)

        # An article that beats "unpublished" by 20 points or less is
        # treated as unpublished.
        if final_type[0] == ARTICLE and points[UNPUBLISHED] != 0:
            if final_type[1] - points[UNPUBLISHED] <= 20:
                final_type = (UNPUBLISHED, points[UNPUBLISHED])

        if final_type[0] == ARTICLE or final_type[0] == UNPUBLISHED:
            if self._is_proceedings(text):
                final_type = (INPROCEEDINGS, final_type[1])

        return final_type[0]

    def get_articles(self, text):
        """
        In case of proceedings, this method returns list of contained articles.
        """
        articles = []

        if self.re_abstract.search(text):
            reduced_text = self.re_abstract.search(text).group(1)
        else:
            reduced_text = text

        if self.re_proceedings_prefix_strict.search(reduced_text):
            groups = self.re_proceedings_prefix_strict.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)
        elif self.re_proceedings_prefix.search(reduced_text):
            groups = self.re_proceedings_prefix.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)
        elif self.re_proceedings.search(reduced_text):
            groups = self.re_proceedings.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)

        while self.re_multi_strict.search(text):
            article = self.re_multi_strict.search(text).group(1)
            text = re.sub(re.escape(article), "", text)
            while self.re_previous_1.search(article):
                previous = self.re_previous_1.search(article).group(1)
                article = re.sub(re.escape(previous), "", article)
                if len(articles) > 0:
                    articles[len(articles) - 1] += "\n" + previous
            if self.re_previous_2.search(article) and len(articles) > 0:
                articles[len(articles) - 1] += "\n\n" + article
            else:
                articles.append(article)

        if len(articles) == 1:
            articles = []
        return articles
    def __init__(self):
        """
        Prepares the keyword tables and regular expressions used for
        document type detection and for splitting proceedings into articles.

        NOTE(review): this is the second __init__ in this class body; being
        the later definition, it is the one Python binds.
        """
        # Lower-case month names and their usual abbreviations.
        self.pat_month = "january|february|march|april|may|june|july|august|"
        self.pat_month += "september|october|november|december|jan\.?|feb\.?|"
        self.pat_month += "mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        self.pat_month += "nov\.?|dec\.?"
        # Month, optional numeric parts (e.g. a day) and a year 1900-2399.
        # The month alternation is wrapped in its own non-capturing group:
        # without it the trailing " ..." part belonged only to the last
        # alternative (dec), so e.g. "march 2005" could never match.
        # Non-capturing keeps the numbering of the other groups unchanged.
        self.pat_date = "((?:" + self.pat_month + \
                        ") ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"
        # Keyword tables: a regular expression (or a predicate taking the
        # text) mapped to the weight get_document_type adds to the score of
        # the document type when the keyword is found.
        self.kws_unpublished = {"introduction":50, "abstract":50,
                                "related work":50}
        self.kws_article = {"(vol\.?|volume)\s*[0-9]+":20, self.pat_date:20,
                            "(pages?|pp?\.?)\s*[-0-9]+(?!\))":20,
                            "(number|no\.?)\s*[_0-9]+":20,
                            "copyright":20, "all rights reserved":20,
                            "journal":20, "is published":20, "published in":20,
                            "first published":20, "in press":20,
                            "introduction":50, "abstract":50,
                            "related work":50, self._find_events:50,
                            self._find_events_2:20}
        self.kws_techreport = {"this report":200, "tech[a-z]+ report":200,
                               "summary report":100, "is (a )?report":80}
        # NOTE(review): "Ph\.?D thesis" contains capital letters, so it only
        # matches when the search is case-insensitive or the text keeps its
        # original case.
        self.kws_phdthesis = {"supervisor":100, "this thesis":200,
                              "dissertation":200, "Ph\.?D thesis":200}
        self.kws_masterthesis = {"supervisor":100, "this thesis":200,
                                 "master thesis":200, "master\W?s thesis":200}
        # Document type constant -> keyword table scored for that type.
        self.types = {UNPUBLISHED:self.kws_unpublished, ARTICLE:self.kws_article,
                      PHDTHESIS:self.kws_phdthesis,
                      MASTERTHESIS:self.kws_masterthesis,
                      TECHREPORT:self.kws_techreport}

        # NOTE(review): self.re_proceedings is assigned again further down
        # in this method, so this keyword pattern does not survive __init__.
        self.re_proceedings = re.compile('\W(proceedings|conference)\W', re.DOTALL)
        # Patterns applied to the output of the pdfinfo program.
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        # NOTE(review): re_oo_impress is not referenced by the methods
        # visible in this class - confirm whether it is still needed.
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile('Pages:\s*([0-9]+)')

        # Group 1 of re_chapters is everything before the last
        # "References"/"REFERENCES" heading (greedy match).
        self.pat_chapters = "R(eferences|EFERENCES)"
        self.re_chapters = re.compile('(^.*\W)(' + self.pat_chapters + ')\W',
                                      re.DOTALL)

        #Proceedings articles patterns and RE
        # Clock time such as "9:30" or "14:05".
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        # Two or more dots (possibly spaced) followed by a number that ends
        # the line.
        self.pat_toc_page = "(\.\s*){2,}[0-9]+\n"
        # Blank line, optionally with a lone dash before or after it.
        self.pat_pagesep = "(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        # Roman numeral I-XX standing between two page separators.
        self.pat_roman_nums = self.pat_pagesep + \
             "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I)" + \
            self.pat_pagesep
        # Page end: a number between page separators, a "page N" line, or
        # ".N" followed by a page separator.
        self.pat_page_end_strict = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+" + \
                self.pat_pagesep + ")"
        # Looser variant that also accepts a bare blank line.
        self.pat_page_end = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        # Greedy match from the start of the text up to the last time or
        # dotted page number followed by a page end, or up to a roman
        # numeral; group 1 is the part get_articles strips.
        self.re_proceedings_prefix_strict = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end_strict + "|" + \
                       self.pat_roman_nums + "))", re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end + "|" + self.pat_roman_nums + \
                       "))", re.DOTALL | re.IGNORECASE)
        # Group 1 is the text before the first line starting with
        # "abstract" or "references" (any case).
        self.re_abstract = \
            re.compile("(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
                       re.DOTALL | re.IGNORECASE)
        # Replaces the keyword pattern assigned above: group 1 runs from
        # the start of the text to the first blank line after "proceedings".
        self.re_proceedings = re.compile("(^.+?\Wproceedings\W.*?\n\n)",
                                         re.DOTALL | re.IGNORECASE)

        # One article: at least 5000 characters, a "References" line (any
        # case) and then the shortest run up to a strict page end.
        self.re_multi_strict = \
            re.compile('(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' + \
                       self.pat_page_end_strict + ')', re.DOTALL)
        # Leading fragment of a candidate article that get_articles hands
        # back to the previous article.
        self.re_previous_1 = \
            re.compile('(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|' + \
                       '[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|' + \
                       '[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' + \
                       self.pat_page_end + ')', re.DOTALL)
        # Candidate starting with a section-like heading; get_articles
        # appends such a candidate to the previous article.
        self.re_previous_2 = \
            re.compile('^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|' + \
                       'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|' + \
                       'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
                       re.DOTALL)


        # Dictionary built from EVENT_ACRONYMS, searched by _find_events.
        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)
class DocumentInfo(object):
    """
    This class collects information about PDF documents.
    """

    def __init__(self):
        """
        Prepares the keyword tables and regular expressions used for
        document type detection and for splitting proceedings into articles.
        """
        # Lower-case month names and their usual abbreviations.
        self.pat_month = "january|february|march|april|may|june|july|august|"
        self.pat_month += "september|october|november|december|jan\.?|feb\.?|"
        self.pat_month += "mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        self.pat_month += "nov\.?|dec\.?"
        # Month, optional numeric parts (e.g. a day) and a year 1900-2399.
        # The month alternation is wrapped in its own non-capturing group:
        # without it the trailing " ..." part belonged only to the last
        # alternative (dec), so e.g. "march 2005" could never match.
        # Non-capturing keeps the numbering of the other groups unchanged.
        self.pat_date = "((?:" + self.pat_month + \
                        ") ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"
        # Keyword tables: a regular expression (or a predicate taking the
        # text) mapped to the weight get_document_type adds to the score of
        # the document type when the keyword is found.
        self.kws_unpublished = {"introduction":50, "abstract":50,
                                "related work":50}
        self.kws_article = {"(vol\.?|volume)\s*[0-9]+":20, self.pat_date:20,
                            "(pages?|pp?\.?)\s*[-0-9]+(?!\))":20,
                            "(number|no\.?)\s*[_0-9]+":20,
                            "copyright":20, "all rights reserved":20,
                            "journal":20, "is published":20, "published in":20,
                            "first published":20, "in press":20,
                            "introduction":50, "abstract":50,
                            "related work":50, self._find_events:50,
                            self._find_events_2:20}
        self.kws_techreport = {"this report":200, "tech[a-z]+ report":200,
                               "summary report":100, "is (a )?report":80}
        # NOTE(review): "Ph\.?D thesis" contains capital letters, so it only
        # matches when the search is case-insensitive or the text keeps its
        # original case.
        self.kws_phdthesis = {"supervisor":100, "this thesis":200,
                              "dissertation":200, "Ph\.?D thesis":200}
        self.kws_masterthesis = {"supervisor":100, "this thesis":200,
                                 "master thesis":200, "master\W?s thesis":200}
        # Document type constant -> keyword table scored for that type.
        self.types = {UNPUBLISHED:self.kws_unpublished, ARTICLE:self.kws_article,
                      PHDTHESIS:self.kws_phdthesis,
                      MASTERTHESIS:self.kws_masterthesis,
                      TECHREPORT:self.kws_techreport}

        # NOTE(review): self.re_proceedings is assigned again further down
        # in this method, so this keyword pattern does not survive __init__.
        self.re_proceedings = re.compile('\W(proceedings|conference)\W', re.DOTALL)
        # Patterns applied to the output of the pdfinfo program.
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        # NOTE(review): re_oo_impress is not referenced by the methods
        # visible in this class - confirm whether it is still needed.
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile('Pages:\s*([0-9]+)')

        # Group 1 of re_chapters is everything before the last
        # "References"/"REFERENCES" heading (greedy match).
        self.pat_chapters = "R(eferences|EFERENCES)"
        self.re_chapters = re.compile('(^.*\W)(' + self.pat_chapters + ')\W',
                                      re.DOTALL)

        #Proceedings articles patterns and RE
        # Clock time such as "9:30" or "14:05".
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        # Two or more dots (possibly spaced) followed by a number that ends
        # the line.
        self.pat_toc_page = "(\.\s*){2,}[0-9]+\n"
        # Blank line, optionally with a lone dash before or after it.
        self.pat_pagesep = "(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        # Roman numeral I-XX standing between two page separators.
        self.pat_roman_nums = self.pat_pagesep + \
             "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II|I)" + \
            self.pat_pagesep
        # Page end: a number between page separators, a "page N" line, or
        # ".N" followed by a page separator.
        self.pat_page_end_strict = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+" + \
                self.pat_pagesep + ")"
        # Looser variant that also accepts a bare blank line.
        self.pat_page_end = "(" + self.pat_pagesep + "[0-9]+" + \
            self.pat_pagesep + "|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        # Greedy match from the start of the text up to the last time or
        # dotted page number followed by a page end, or up to a roman
        # numeral; group 1 is the part get_articles strips.
        self.re_proceedings_prefix_strict = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end_strict + "|" + \
                       self.pat_roman_nums + "))", re.DOTALL | re.IGNORECASE)
        self.re_proceedings_prefix = \
            re.compile("(^.+((" + self.pat_time + "|" + self.pat_toc_page + \
                       ").*" + self.pat_page_end + "|" + self.pat_roman_nums + \
                       "))", re.DOTALL | re.IGNORECASE)
        # Group 1 is the text before the first line starting with
        # "abstract" or "references" (any case).
        self.re_abstract = \
            re.compile("(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
                       re.DOTALL | re.IGNORECASE)
        # Replaces the keyword pattern assigned above: group 1 runs from
        # the start of the text to the first blank line after "proceedings".
        self.re_proceedings = re.compile("(^.+?\Wproceedings\W.*?\n\n)",
                                         re.DOTALL | re.IGNORECASE)

        # One article: at least 5000 characters, a "References" line (any
        # case) and then the shortest run up to a strict page end.
        self.re_multi_strict = \
            re.compile('(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' + \
                       self.pat_page_end_strict + ')', re.DOTALL)
        # Leading fragment of a candidate article that get_articles hands
        # back to the previous article.
        self.re_previous_1 = \
            re.compile('(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|' + \
                       '[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|' + \
                       '[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' + \
                       self.pat_page_end + ')', re.DOTALL)
        # Candidate starting with a section-like heading; get_articles
        # appends such a candidate to the previous article.
        self.re_previous_2 = \
            re.compile('^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|' + \
                       'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|' + \
                       'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
                       re.DOTALL)


        # Dictionary built from EVENT_ACRONYMS, searched by _find_events.
        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)


    def _find_events(self, text):
        """
        Returns True when searching text with self.rrsdict_events yields at
        least one hit, False otherwise.
        """
        hits = self.rrsdict_events.text_search(text, False, RET_ORIG_TERM)
        return len(hits) > 0


    def _find_events_2(self, text):
        if re.search("[0-9A-Z]+(/[0-9A-Z]+)+", text):
            return True
        else:
            return False


    def _is_proceedings(self, text):
        if self.re_proceedings.search(text):
            return True
        else:
            return False


    def _is_presentation(self, pdfinfo):
        if self.re_ms_powerpoint.search(pdfinfo):
            return True
        else:
            return False


    def _is_poster(self, pdfinfo):
        if self.re_pages.search(pdfinfo):
            pnum = int(self.re_pages.search(pdfinfo).group(1))
            if pnum == 1:
                return True
            else:
                return False
        else:
            return False


    def _shrink_text(self, text):
        if self.re_chapters.search(text):
            text = self.re_chapters.search(text).group(1)
        return text


    def get_pdfinfo(self, pdf_file_path):
        """
        Returns pdf metadata using pdfinfo program.
        """
        return commands.getoutput('pdfinfo ' + pdf_file_path)


    def get_document_type(self, text_file_path, pdf_file_path=None):
        """
        Main method.
        Returns type of specified document.

        When pdf_file_path is given, the pdfinfo metadata is checked first
        and PRESENTATION or POSTER is returned straight away if it matches.
        Otherwise the first 5000 characters of the text file (cut before a
        "References" heading, if any) are scored against the keyword tables
        in self.types and the best scoring type is returned. MISC is
        returned when nothing scores.

        text_file_path -- path of the text file to classify
        pdf_file_path -- optional path of the PDF passed to pdfinfo
        """
        if pdf_file_path is not None:
            pdfinfo = self.get_pdfinfo(pdf_file_path)
            if self._is_presentation(pdfinfo):
                return PRESENTATION
            elif self._is_poster(pdfinfo):
                return POSTER

        # Only the head of the document is examined; the file is closed as
        # soon as it has been read.
        offset = 5000
        with open(text_file_path, 'r') as text_file:
            text_orig = text_file.read(offset)
        text_orig = self._shrink_text(text_orig)
        text = text_orig.lower()

        # Sum the weights of all keywords found, per document type.
        points = {}
        for doc_type, keywords in self.types.items():
            score = 0
            for kw, weight in keywords.items():
                if isinstance(kw, str):
                    # The text is lower-cased, so keywords written with
                    # capitals (e.g. the "Ph.D thesis" one) need IGNORECASE
                    # to be able to match at all.
                    found = re.search(r"(^|\W)" + kw + r"(\W|$)", text,
                                      re.DOTALL | re.IGNORECASE)
                else:
                    # Predicates get the text in its original case.
                    found = kw(text_orig)
                if found:
                    score += int(weight)
            points[doc_type] = score

        # Strictly greater: on a tie the type met first keeps the lead.
        final_type = (MISC, 0)
        for doc_type, score in points.items():
            if score > final_type[1]:
                final_type = (doc_type, score)

        # An article that beats "unpublished" by 20 points or less is
        # treated as unpublished.
        if final_type[0] == ARTICLE and points[UNPUBLISHED] != 0:
            if final_type[1] - points[UNPUBLISHED] <= 20:
                final_type = (UNPUBLISHED, points[UNPUBLISHED])

        if final_type[0] == ARTICLE or final_type[0] == UNPUBLISHED:
            if self._is_proceedings(text):
                final_type = (INPROCEEDINGS, final_type[1])

        return final_type[0]
    
    def get_articles(self, text):
        """
        In case of proceedings, this method returns list of contained articles.
        """
        articles = []
        
        if self.re_abstract.search(text):
            reduced_text = self.re_abstract.search(text).group(1)
        else:
            reduced_text = text

        if self.re_proceedings_prefix_strict.search(reduced_text):
            groups = self.re_proceedings_prefix_strict.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)
        elif self.re_proceedings_prefix.search(reduced_text):
            groups = self.re_proceedings_prefix.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)
        elif self.re_proceedings.search(reduced_text):
            groups = self.re_proceedings.search(reduced_text)
            text = re.sub(re.escape(groups.group(1)), "", text)
        
        while self.re_multi_strict.search(text):
            article = self.re_multi_strict.search(text).group(1)
            text = re.sub(re.escape(article), "", text)
            while self.re_previous_1.search(article):
                previous = self.re_previous_1.search(article).group(1)
                article = re.sub(re.escape(previous), "", article)
                if len(articles) > 0:
                    articles[len(articles) - 1] += "\n" + previous
            if self.re_previous_2.search(article)and len(articles) > 0:
                articles[len(articles) - 1] += "\n\n" + article
            else:
                articles.append(article)

        if len(articles) == 1:
            articles = []
        return articles