Example 1
import os
import re
import sys

import util  ## project-local helpers (get_files, remove_tags); SummaryProblem comes from the same codebase

def setup_DUC_basic(task, skip_updates=False):
    """
    task.topic_file: sgml file for DUC
    task.doc_path: path containing source documents
    task.manual_path: path for manual (human) summaries
    """

    ## get all document data
    all_docs = {}
    files = util.get_files(task.doc_path, r'\w{2,3}\d+[\.\-]\d+')
    sys.stderr.write('Loading [%d] files\n' % len(files))
    for path in files:
        doc_id = os.path.basename(path)
        all_docs[doc_id] = path

    ## initialize problems
    problems = []
    data = open(task.topic_file).read().replace('\n', ' ')
    topics = re.findall('<topic>.+?</topic>', data)
    sys.stderr.write('Setting up [%d] problems\n' % len(topics))
    for topic in topics:
        id = util.remove_tags(re.findall('<num>.+?</num>', topic)[0])[:-1]
        title = util.remove_tags(re.findall('<title>.+?</title>', topic)[0])
        narr = util.remove_tags(re.findall('<narr>.+?</narr>', topic)[0])
        docsets = re.findall('<docs.*?>.+?</docs.*?>', topic)
        docsets = map(util.remove_tags, docsets)
        docsets = [d.split() for d in docsets]

        old_docs = []
        for docset_index in range(len(docsets)):

            ## update naming convention different from main
            if len(docsets) > 1:
                id_ext = '-' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'[docset_index]
            else:
                id_ext = ''

            new_docs = [all_docs[doc] for doc in docsets[docset_index]]

            ## create a SummaryProblem
            problem = SummaryProblem(id + id_ext, title, narr, new_docs,
                                     old_docs)
            old_docs += new_docs

            ## include training data in problem
            if task.manual_path: problem._load_training(task.manual_path)

            problems.append(problem)

            ## skip updates?
            if skip_updates: break

    task.problems = problems
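
A minimal usage sketch, assuming a simple task holder; the field names follow the docstring above, while the Task class and the file paths are hypothetical stand-ins:

import sys

class Task:
    def __init__(self, topic_file, doc_path, manual_path=None):
        self.topic_file = topic_file    ## sgml topic file for DUC (hypothetical path below)
        self.doc_path = doc_path        ## directory of source documents
        self.manual_path = manual_path  ## directory of manual (human) summaries, if any
        self.problems = []

task = Task('duc06.topics.sgml', 'duc06/docs/', 'duc06/models/')
setup_DUC_basic(task)
sys.stderr.write('created [%d] problems\n' % len(task.problems))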
Example 2
    def __init__(self, path, is_clean=False):
        """
        path is the location of the file to process
        is_clean=True means that file has no XML or other markup: just text
        """
        self.id = 'NONE'
        self.date = 'NONE'
        self.source = 'NONE'
        self.paragraphs = []
        self._isempty = True

        ## get generic info
        try:
            rawdata = open(path).read()
        except IOError:
            sys.stderr.write('ERROR: could not read: %s\n' % path)
            return
        try:
            self.id = util.remove_tags(
                re.findall('<DOCNO>[^>]+</DOCNO>', rawdata[:100])[0])
        except IndexError:
            ## fall back to attribute style: <DOC id="AFP_ENG_20050125.0151" type="story" >
            match = re.search('<DOC id="([^"]+)"', rawdata[:100])
            if match:
                self.id = match.group(1)
            else:
                sys.stderr.write('ERROR: no <DOCNO>/<DOC id=...> tag: %s\n' %
                                 path)
                sys.stderr.write('%s\n' % rawdata[:100])

        ## source and date from id (assumes newswire style)
        if self.id != 'NONE':
            self.source = re.findall(r'^[^_\d]*', self.id)[0]
            self.date = self.id.replace(self.source, '')

        ## parse various types of newswire xml
        if is_clean: text = self._parse_clean(path)
        else: text = self._parse_newswire(path)

        if len(text) == 0:
            sys.stderr.write('WARNING: no text read for: %s\n' % path)
            return

        self.paragraphs = []
        for paragraph in text:
            fixed_par = self._fix_newswire(paragraph)
            if fixed_par == '': continue
            self.paragraphs.append(fixed_par)

        self._isempty = False
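
A sketch of how this constructor might be exercised; the enclosing class name is not shown in the excerpt, so Document and the input path here are assumptions:

import sys

doc = Document('docs/APW_ENG_20050125.0151')  ## hypothetical class name and path
if not doc._isempty:
    sys.stderr.write('%s (%s, %s): [%d] paragraphs\n' %
                     (doc.id, doc.source, doc.date, len(doc.paragraphs)))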
Example 3
    def __init__(self, path, is_clean=False):
        """
        path is the location of the file to process
        is_clean=True means that file has no XML or other markup: just text
        """
        self.id = 'NONE'
        self.date = 'NONE'
        self.source = 'NONE'
        self.paragraphs = []
        self._isempty = True

        ## get generic info
        if os.path.isfile(path): rawdata = open(path).read()
        elif path.strip().startswith('<DOC>'): rawdata = path
        else:
            sys.stderr.write('ERROR: could not read: %s\n' % path)
            return

        try:
            self.id = util.remove_tags(re.findall('<DOCNO>[^>]+</DOCNO>', rawdata[:100])[0])
        except IndexError:
            ## fall back to attribute style: <DOC id="..." type="story" >
            match = re.search('<DOC id="([^"]+)"', rawdata[:100])
            if match:
                self.id = match.group(1)
            else:
                sys.stderr.write('ERROR: no <DOCNO>/<DOC id=...> tag: %s\n' % path)

        ## source and date from id (assumes newswire style)
        if self.id != 'NONE':
            self.source = re.findall(r'^[^_\d]*', self.id)[0]
            self.date = self.id.replace(self.source, '')

        ## parse various types of newswire xml
        if is_clean: text = self._parse_clean(rawdata)
        else: text = self._parse_newswire(rawdata)

        if len(text) == 0:
            #sys.stderr.write('WARNING: no text read for: %s\n' % path)
            return

        self.paragraphs = []
        for paragraph in text:
            fixed_par = self._fix_newswire(paragraph)
            if fixed_par == '': continue
            self.paragraphs.append(fixed_par)
        
        self._isempty = False
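
This variant accepts either a filename or the raw '<DOC>' text itself. A quick sketch of both call styles, again assuming the enclosing class is named Document:

doc_from_file = Document('docs/APW_ENG_20050125.0151')  ## hypothetical path, read from disk
doc_from_text = Document('<DOC> <DOCNO> XIE19980304.0061 </DOCNO> '
                         '<TEXT> <P> Some sample text. </P> </TEXT> </DOC>')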
Example 4
  url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c)
  doc = util.dl_and_prep(url)
  results += re.findall(pat, doc)

print len(results), 'total professors'
output = []
for prof in results:
  pd = {}
  pd['lab_website'] = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['name'] = prof[1]
  #extract the primary department from within the <b> tags
  if '<b>' in prof[2]:
    pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
  else:
    pd['department'] = util.prep_department(util.remove_tags(prof[2]))
  research = prof[3].replace('&nbsp;', '').strip()
  if len(research) > 0:
    pd['keywords'] = util.split_and_clean(research, ',')
  
  pd['school'] = 'Stanford University'
  personal_page = util.dl_and_prep(pd['lab_website'])
  summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
  if summary:
    pd['research_summary'] = util.html_escape(summary[0].strip())
  try:
    pd['image'] = 'http://soe.stanford.edu/research/%s' % re.findall('\'(images/photos_faculty_staff/.*?)\'', personal_page)[0]
  except Exception:
    ## no faculty photo found on the page; drop into the debugger to inspect
    import pdb; pdb.set_trace()
  pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0]  
  personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page)
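
The snippet leans on project-local helpers (dl_and_prep, prep_department, split_and_clean). As a rough sketch of the behavior split_and_clean is assumed to have here (split on a separator, strip whitespace, drop empties):

def split_and_clean(text, sep):
    ## split on sep, strip whitespace, and drop empty items
    return [t.strip() for t in text.split(sep) if t.strip()]

print split_and_clean('machine learning, , robotics ', ',')
## ['machine learning', 'robotics']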
Example 5
               line = re.sub(ur'([^0-9]|^)1([^0-9])', ur'\1一\2', line, flags=re.UNICODE)
               # replace <SB> with period
               #line = re.sub(ur'<SB>$', ur'.', line, re.UNICODE)
               #line = remove_tags(line)
               # TODO: should not remove these lines!!
               #if re.match(ur'[\.,\!\?]+$', line, re.UNICODE):
               #   line = ''
               fout.write('%s\n' % line)
   else:
      time_format = check_format(input)
      with codecs.open(input, 'r', encoding='utf-8') as fin, codecs.open(output, 'w', encoding='utf-8') as fout:
         for line in fin:
            line = line.strip()
            if len(line) == 0:
               fout.write('%s\n' % line)
            else:
               line = remove_tags(line)
               line = re.sub(ur'([^0-9]|^)1([^0-9])', ur'\1一\2', line, flags=re.UNICODE)
               line = re.sub(ur'([,\.\?!"])+', ur'\1', line, flags=re.UNICODE)
               if len(line) > 0:
                  if time_format == 'int':
                     try:
                        uid, start, end, text = re.split(r'\s+', line, 3)
                     # some line has empty text after removing tags
                     except ValueError:
                        continue
                     fout.write('%04d - %s - %s\n%s\n' % (int(uid), int2time(start), int2time(end), text))
                  else:
                     fout.write('%s\n' % line)
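
The flags= keyword in the substitutions above matters: the fourth positional argument of re.sub is count, not flags, so passing re.UNICODE positionally would silently set the replacement count instead. A quick demonstration:

import re

## signature: re.sub(pattern, repl, string, count=0, flags=0)
print re.sub('a', 'b', 'aaa', 1)                    ## 'baa' (the 1 is count)
print re.sub('A', 'b', 'aaa', flags=re.IGNORECASE)  ## 'bbb' (flags by keyword)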