Example #1
def get_bag_of_words(urls):
    docs = get_documents(urls)
    bag_of_words = {}
    for url in docs.keys():
        bof = process_text(docs[url])
        bof = valid_words(bof)
        bag_of_words[url] = bof
    return bag_of_words
Example #2
def get_bag_of_words(urls):
    docs = get_documents(urls)
    bag_of_words = {}
    for url in docs.keys():
        bof = process_text(docs[url])
        bof = valid_words(bof)
        bag_of_words[url] = bof
    return bag_of_words
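A minimal usage sketch for the function above. The URLs are hypothetical, and it assumes the surrounding module provides get_documents (URL to raw text), process_text (tokenization) and valid_words (token filtering), as the example implies.

urls = ["http://example.com/page1", "http://example.com/page2"]   # hypothetical seed URLs
bag_of_words = get_bag_of_words(urls)
for url in bag_of_words:
    # each entry maps a URL to the filtered tokens produced by valid_words
    print(url + ": " + str(len(bag_of_words[url])) + " terms")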
  def getBackwardLinks(self, urls, session):

    es_info = self.esInfo(session['domainId'])
    
    # URLs whose backward links have already been crawled
    results = field_exists("crawled_backward", [es_info['mapping']['url']], self._all, es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    already_crawled = [result[es_info["mapping"]["url"]][0] for result in results]

    # Restrict the backward crawl to the URLs that have not been processed yet
    not_crawled = list(set(urls).difference(already_crawled))
    results = get_documents(not_crawled, es_info["mapping"]['url'], [es_info["mapping"]['url']], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    not_crawled_urls = [results[url][0][es_info["mapping"]["url"]][0] for url in not_crawled]

    # Launch the seeds generator's backward-link crawl for the remaining URLs
    chdir(environ['DDT_HOME']+'/seeds_generator')

    comm = "java -cp target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar StartCrawl -c backward"\
           " -u \"" + ",".join(not_crawled_urls) + "\"" + \
           " -t " + session["pagesCap"] + \
           " -i " + es_info['activeCrawlerIndex'] + \
           " -d " + es_info['docType'] + \
           " -s " + es_server
    
    p=Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors
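Sketch only, not part of the original method: the same StartCrawl invocation could be assembled as an argument list, so the comma-joined URL string needs no shell quoting. The helper name and parameters below are hypothetical; the jar path, class name and flags are taken from the command above.

from subprocess import Popen, PIPE

def run_backward_crawl(not_crawled_urls, pages_cap, index, doc_type, es_server):
    # hypothetical helper: assumes the current directory is DDT_HOME/seeds_generator,
    # as set by the chdir() call above, so the relative jar path resolves
    comm = ["java", "-cp",
            "target/seeds_generator-1.0-SNAPSHOT-jar-with-dependencies.jar",
            "StartCrawl",
            "-c", "backward",
            "-u", ",".join(not_crawled_urls),
            "-t", str(pages_cap),
            "-i", index,
            "-d", doc_type,
            "-s", es_server]
    p = Popen(comm, stderr=PIPE)      # no shell=True needed when passing a list
    return p.communicate()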
import operator
from sklearn.feature_extraction.text import CountVectorizer

def get_bigrams_trigrams(text=[], termCount=20, w2v=None, es=None):

        # Analyzers that extract raw bigrams and trigrams from each document
        bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
        bigram_analyze = bigram_vectorizer.build_analyzer()
        trigram_vectorizer = CountVectorizer(ngram_range=(3,3))
        trigram_analyze = trigram_vectorizer.build_analyzer()

        bi_results = map(lambda t: bigram_analyze(t), text)
        tri_results = map(lambda t: trigram_analyze(t), text)
        
        bigrams = []
        bi_dict_corpus = {}
        for doc in bi_results:
                bi_dict={}
                for bi in doc:
                        bi=bi.replace(' ','_')
                        if bi in bi_dict:
                                bi_dict[bi] = bi_dict[bi] + 1
                        else:
                                bi_dict[bi] = 1 
                                
                if bi_dict:
                        # Yamuna: Removing for now as it is slow
                        #phrases = remove_stopword_phrases(bi_dict.keys())        
                        phrases = bi_dict.keys()
                        if w2v.word_vec is None:
                                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                                phrases = [res.lower() for res in results.keys()]
                        else:
                                phrases = [term for term in phrases if not w2v.get(term) is None]
                        
        
                        bi_dict_subset = {phrase: bi_dict[phrase] for phrase in phrases}
                        if bi_dict_subset:
                                bigrams.append(bi_dict_subset)
                                for phrase in bi_dict_subset.keys():
                                        if phrase in bi_dict_corpus:
                                                bi_dict_corpus[phrase] = bi_dict_corpus[phrase] + bi_dict_subset[phrase]
                                        else:
                                                bi_dict_corpus[phrase] = bi_dict_subset[phrase]
                                                
                        
        trigrams = []
        tri_dict_corpus = {}
        for doc in tri_results:
                tri_dict={}
                for tri in doc:
                        tri=tri.replace(' ','_')
                        if tri in tri_dict:
                                tri_dict[tri] = tri_dict[tri] + 1
                        else:
                                tri_dict[tri] = 1
                if tri_dict:
                        # Yamuna: Removing for now as it is slow
                        #phrases = remove_stopword_phrases(tri_dict.keys())        
                        phrases = tri_dict.keys()
                        if w2v.word_vec is None:
                                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                                phrases = [res for res in results.keys()]
                        else:
                                phrases = [term for term in phrases if not w2v.get(term) is None]

                        tri_dict_subset = {phrase: tri_dict[phrase] for phrase in phrases}
                        if tri_dict_subset:
                                trigrams.append(tri_dict_subset)
                                for phrase in tri_dict_subset.keys():
                                        if phrase in tri_dict_corpus:
                                                tri_dict_corpus[phrase] = tri_dict_corpus[phrase] + tri_dict_subset[phrase]
                                        else:
                                                tri_dict_corpus[phrase] = tri_dict_subset[phrase]
                                                
        top_bigrams = sorted(bi_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount]
        top_trigrams = sorted(tri_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount]
        return bigrams, trigrams, top_bigrams, top_trigrams
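A small, self-contained usage sketch for get_bigrams_trigrams. The real word2vec wrapper used by the project is not shown here, so the stub below only mimics the two members the function touches (word_vec and get); the vocabulary and documents are made up for illustration, and scikit-learn must be installed for CountVectorizer.

class StubW2V(object):
    # hypothetical stand-in for the project's word2vec wrapper; word_vec is set to a
    # non-None value so the Elasticsearch lookup branch is skipped
    word_vec = object()
    def __init__(self, vocab):
        self.vocab = vocab
    def get(self, term):
        return self.vocab.get(term)

docs = ["deep web crawling finds hidden pages",
        "focused crawling guides the deep web crawler"]
w2v = StubW2V({"deep_web": 1, "deep_web_crawling": 1})
bigrams, trigrams, top_bigrams, top_trigrams = get_bigrams_trigrams(docs, 5, w2v, None)
print(top_bigrams)    # [('deep_web', 2)]
print(top_trigrams)   # [('deep_web_crawling', 1)]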
  def setPagesTag(self, pages, tag, applyTagFlag, session):
    
    es_info = self.esInfo(session['domainId'])

    entries = {}
    results = get_documents(pages, 'url', [es_info['mapping']['tag']], es_info['activeCrawlerIndex'], es_info['docType'],  self._es)

    if applyTagFlag and len(results) > 0:
      print '\n\napplied tag ' + tag + ' to pages ' + str(pages) + '\n\n'
      
      for page in pages:
        if results.get(page) is not None:
          # pages to be tagged exist
          records = results[page]
          for record in records:
            entry = {}
            if record.get(es_info['mapping']['tag']) is None:
              # there are no previous tags
              entry[es_info['mapping']['tag']] = tag
            else:
              current_tag = record[es_info['mapping']['tag']][0]
              tags = []
              if current_tag != '':
                # an empty tag string means all previous tags were removed
                tags = list(set(current_tag.split(';')))

              if len(tags) != 0:
                # previous tags exist
                if tag not in tags:
                  # append the new tag
                  entry[es_info['mapping']['tag']] = ';'.join(tags)+';'+tag
              else:
                # add the new tag
                entry[es_info['mapping']['tag']] = tag

            if entry:
              entries[record['id']] = entry

    elif len(results) > 0:
      print '\n\nremoved tag ' + tag + ' from pages ' + str(pages) + '\n\n'

      for page in pages:
        if results.get(page) is not None:
          records = results[page]
          for record in records:
            entry = {}
            if record.get(es_info['mapping']['tag']) is not None:
              current_tag = record[es_info['mapping']['tag']][0]
              # check membership against the split tag list rather than the raw string,
              # so removing 'Relevant' does not match inside 'Irrelevant'
              tags = list(set(current_tag.split(';')))
              if tag in tags:
                tags.remove(tag)
                entry[es_info['mapping']['tag']] = ';'.join(tags)
                entries[record['id']] = entry
    
    if entries:
      # Retry the bulk update a few times to tolerate transient Elasticsearch failures
      update_try = 0
      while update_try < 10:
        try:
          update_document(entries, es_info['activeCrawlerIndex'], es_info['docType'], self._es)
          break
        except Exception:
          update_try = update_try + 1
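For clarity, the tag bookkeeping above can be summarized by the two small helpers below. They are illustrative only (the method itself works on Elasticsearch records, not bare strings): tags are stored as a single ';'-separated string, duplicates are dropped, applying appends a missing tag, and removing matches whole tags rather than substrings.

def apply_tag(current_tag, tag):
    # mirrors the applyTagFlag branch: an empty string means no previous tags
    tags = list(set(current_tag.split(';'))) if current_tag else []
    if not tags:
        return tag
    return current_tag if tag in tags else ';'.join(tags) + ';' + tag

def remove_tag(current_tag, tag):
    # mirrors the removal branch: only an exact tag in the list is removed
    tags = list(set(current_tag.split(';')))
    if tag in tags:
        tags.remove(tag)
        return ';'.join(tags)
    return current_tag

print(apply_tag('', 'Relevant'))               # 'Relevant'
print(remove_tag('Irrelevant', 'Relevant'))    # 'Irrelevant' unchanged: no substring match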
  def createModel(self, session):
    es_info = self.esInfo(session['domainId'])

    data_dir = environ["DDT_HOME"] + "/data/"
    data_crawler  = data_dir + es_info['activeCrawlerIndex']
    data_training = data_crawler + "/training_data/"
    data_negative = data_crawler + "/training_data/negative/"
    data_positive = data_crawler + "/training_data/positive/"

    if (not isdir(data_positive)):
      makedirs(data_positive)
    if (not isdir(data_negative)):
      makedirs(data_negative)

    s_fields = {}
    # '*Relevant*' also matches documents tagged 'Irrelevant', so those are filtered out below
    query = {
      "wildcard": {es_info['mapping']["tag"]:"*Relevant*"}
    }
    s_fields["queries"] = [query]
    pos_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']],
                                    es_info['activeCrawlerIndex'],
                                    es_info['docType'],
                                    self._es) if "irrelevant" not in ''.join(field["tag"]).lower()]

    query = {
      "wildcard": {es_info['mapping']["tag"]:"*Irrelevant*"}
    }
    s_fields["queries"] = [query]
    neg_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']], 
                                    es_info['activeCrawlerIndex'], 
                                    es_info['docType'],
                                    self._es)]

    pos_html = get_documents(pos_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    neg_html = get_documents(neg_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'], self._es)

    seeds_file = data_crawler +"/seeds.txt"
    print "Seeds path ", seeds_file
    with open(seeds_file, 'w') as s:
      for url in pos_html:
        try:
          file_positive = data_positive + self.encode(url.encode('utf8'))
          print file_positive
          s.write(url.encode('utf8') + '\n')
          with open(file_positive, 'w') as f:
            f.write(pos_html[url][0][es_info['mapping']['html']][0])

        except IOError:
          _, exc_obj, tb = exc_info()
          f = tb.tb_frame
          lineno = tb.tb_lineno
          filename = f.f_code.co_filename
          linecache.checkcache(filename)
          line = linecache.getline(filename, lineno, f.f_globals)
          print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    for url in neg_html:
      try:
        file_negative = data_negative + self.encode(url.encode('utf8'))
        with open(file_negative, 'w') as f:
          f.write(neg_html[url][0][es_info['mapping']['html']][0])
      except IOError:
        _, exc_obj, tb = exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)
    
    models_dir = environ["DDT_HOME"] + "/vis/html/models/"
    crawlermodel_dir = models_dir + es_info['activeCrawlerIndex']
    
    if (not isdir(models_dir)):
      makedirs(models_dir)

    if (not isdir(crawlermodel_dir)):
      makedirs(crawlermodel_dir)

    ache_home = environ['ACHE_HOME']
    comm = ache_home + "/bin/ache buildModel -t " + data_training + " -o "+ crawlermodel_dir + " -c " + ache_home + "/config/stoplist.txt"
    p = Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors

    zip_filename = models_dir + es_info['activeCrawlerIndex'] + "_model.zip"
    with ZipFile(zip_filename, "w") as modelzip:
      if (isfile(crawlermodel_dir + "/pageclassifier.features")):
        print "zipping file: "+crawlermodel_dir + "/pageclassifier.features"
        modelzip.write(crawlermodel_dir + "/pageclassifier.features", "pageclassifier.features")
      
      if (isfile(crawlermodel_dir + "/pageclassifier.model")):
        print "zipping file: "+crawlermodel_dir + "/pageclassifier.model"
        modelzip.write(crawlermodel_dir + "/pageclassifier.model", "pageclassifier.model")

      if (exists(data_crawler + "/training_data/positive")):
        print "zipping file: "+ data_crawler + "/training_data/positive"
        for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/positive"):
          for html_file in filenames:
            modelzip.write(dirpath + "/" + html_file, "training_data/positive/" + html_file)

      if (exists(data_crawler + "/training_data/negative")):
        print "zipping file: "+ data_crawler + "/training_data/negative"
        for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/negative"):
          for html_file in filenames:
            modelzip.write(dirpath + "/" + html_file, "training_data/negative/" + html_file)
        
      if (isfile(data_crawler +"/seeds.txt")):
        print "zipping file: "+data_crawler +"/seeds.txt"
        modelzip.write(data_crawler +"/seeds.txt", es_info['activeCrawlerIndex'] + "_seeds.txt")

    chmod(zip_filename, 0o777)

    return "models/" + es_info['activeCrawlerIndex'] + "_model.zip"