Example #1
def main():
  # Setup
  pd.set_option('display.max_rows', None)
  pd.set_option('display.max_colwidth', 200)

  article_dir       = '../articles'
  png_dir           = '../pngs'
  training_set_size = 2000
  top_choices       = 10

  # Validate article dir
  if not os.path.exists(article_dir):
    print('Article directory does not exist!', file=sys.stderr)
    return

  # Get all pdfs inside dir
  print(f'Finding articles in directory: {article_dir}')
  pdf_paths = [f'{article_dir}/{x}' for x in os.listdir(article_dir)]
  pdf_paths = [x for x in pdf_paths if os.path.isfile(x) and x.endswith('.pdf')]

  # Validate articles
  if not pdf_paths:
    print('Article directory has no PDF files!', file=sys.stderr)
    return

  # Create article objects for pdfs found
  print('Tokenizing all pdf files found...')
  articles = []
  for pdf_path in pdf_paths:
    try:
      articles.append(Article(pdf_path))
    except (FileNotFoundError, UnicodeDecodeError):
      print(f'Article path {pdf_path} is not a valid file!', file=sys.stderr)
    except LookupError:
      print('NLTK lookup error, try nltk.download(\'punkt\')', file=sys.stderr)
      return

  # Read articles
  print('Extracting all sentences from articles into dataframe...')
  sentence_rows = []
  for article in articles:
    for sentence in article.get_sentences():

      # Filter out some useless sentences
      if not labelers.simple_filter(sentence):
        continue

      # Fill this sentence's fields
      s_dict = {}
      s_dict['article']  = article
      s_dict['sentence'] = sentence

      sentence_rows.append(s_dict)

  # Validate sentences
  if len(sentence_rows) < training_set_size:
    print(f'Could not extract enough ({training_set_size}) sentences!',
      file=sys.stderr)
    return

  # Build training and full sentence sets
  all_sentences = pd.DataFrame(sentence_rows)
  trn_sentences = pd.DataFrame(all_sentences['sentence'].sample(
    training_set_size, random_state=1))
  tst_sentences = pd.DataFrame(all_sentences['sentence'])

  # Build classifiers for all categories
  print('Building classifiers...')
  classifiers = []
  classifiers.append(Classifier(labelers.registered_software, 'software'))
  classifiers.append(Classifier(labelers.registered_species,  'species'))
  classifiers.append(Classifier(labelers.registered_sample,   'sample'))
  classifiers.append(Classifier(labelers.registered_method,   'method'))
  classifiers.append(Classifier(labelers.registered_molecule, 'molecule'))
  classifiers.append(Classifier(labelers.registered_property, 'property'))

  # Train all classifiers on the given data
  print('Training classifier models...')
  for i, cl in enumerate(classifiers):
    cl.train(trn_sentences)
    print(f'{i + 1} / {len(classifiers)}...')

  # Run all classifiers on full data
  print('Running classifier models on full corpus...')
  for cl in classifiers:
    try:
      predictions = cl.classify(tst_sentences)
    except RuntimeError as e:
      print(e, file=sys.stderr)
      continue

    stat_string = f'* Classified with "{cl.get_name().upper()}" labels *'
    print('*' * len(stat_string))
    print(stat_string)
    print('*' * len(stat_string))

    # Add predictions to article
    for prediction, article in zip(predictions, all_sentences['article']):
      article.add_prediction(prediction)

    # Get most used terms for CLASS predictions
    all_dicts     = []
    filtered_tags = ('DT', 'IN', 'CC', 'EX', 'TO', 'WDT', 'PRP',
                     'VBG', 'CD', 'WRB', 'MD', 'VBZ', 'RP', 'SYM',
                     'UH', 'PRP$', 'RB', 'RBS', 'WP', 'VB')
    for article in articles:
      curr_dict = {}

      for sentence, prediction in \
      zip(article.get_sentences(), article.get_predictions()):
        if prediction != 0:
          continue

        filtered_sentence = re.sub(r'[^\w\s]', '', sentence.lower(), flags=re.UNICODE)
        toks = nltk.word_tokenize(filtered_sentence)
        tags = [x[1] for x in nltk.pos_tag(toks)]

        # Walk over sentence with two word sliding window
        for i in range(len(toks) - 1):
          w0           = toks[i]
          w1           = toks[i + 1]
          compound     = f'{w0} {w1}'

          # Skip over useless tags
          if (tags[i] in filtered_tags or
              (len(w0) == 1 and w0 in string.punctuation)):
            continue

          # Add single word
          count         = curr_dict.get(w0, 0) + 1
          curr_dict[w0] = count

          # Skip over useless tags
          if (tags[i + 1] in filtered_tags or
              (len(w1) == 1 and w1 in string.punctuation)):
            continue

          # Add two words
          count               = curr_dict.get(compound, 0) + 1
          curr_dict[compound] = count

      all_dicts.append((article, curr_dict))

      # Clean the predictions for next iteration
      article.clean_predictions()

    # Merge dictionaries
    main_dict = {}
    for article, d in all_dicts:
      for key in d.keys():
        main_get   = main_dict.get(key, (0, list()))
        main_count = main_get[0] + 1
        main_list  = main_get[1]
        main_list.append(article)

        main_dict[key] = (main_count, main_list)

    # Filter low freqs and sort by freq
    main_dict = dict(filter(lambda x: x[1][0] > 2, main_dict.items()))
    main_dict_ordered_keys = sorted(main_dict.keys(),
                                    key=lambda x: main_dict.get(x)[0],
                                    reverse=True)

    # Filter out common words listed in common.txt
    with open('common.txt', 'r') as c:
      common = c.readlines()

    for w in common:
      w = w.strip()
      if w in main_dict_ordered_keys:
        main_dict_ordered_keys.remove(w)

    # Reduce top_choices if there are fewer available choices
    top_choices = min(top_choices, len(main_dict_ordered_keys))

    print('Building relationship graph and keyword histogram...')
    rel_graph = RelGraph(cl.get_name().upper())
    histo = Histogram(cl.get_name().upper())

    for i, k in enumerate(main_dict_ordered_keys[:top_choices]):
      article_list = main_dict.get(k)[1]
      rel_graph.link_concept(k.upper(), article_list)
      histo.count_concept(k.upper(), article_list)

    try:
      print('Rendering graph...')
      rel_graph.cairo_render(f'{png_dir}/{cl.get_name()}', 2160)
      print(f'Success rendering to: {png_dir}/{cl.get_name()}.png')

      print('Rendering histogram...')
      histo.plot(f'{png_dir}/{cl.get_name()}_hist')
      print(f'Success rendering to: {png_dir}/{cl.get_name()}_hist.png')
    except Exception as e:
      print(f'Could not render. {repr(e)}')
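The sliding-window term counting above can be exercised on its own; the following is a minimal standalone sketch of that step, assuming NLTK's punkt and averaged_perceptron_tagger data are available (the function name count_terms is hypothetical, and the tag filter mirrors the one in the example):

import re
import string

import nltk

FILTERED_TAGS = {'DT', 'IN', 'CC', 'EX', 'TO', 'WDT', 'PRP', 'PRP$', 'VBG',
                 'CD', 'WRB', 'MD', 'VBZ', 'RP', 'SYM', 'UH', 'RB', 'RBS',
                 'WP', 'VB'}

def count_terms(sentence, counts=None):
    # Count unigrams and adjacent two-word compounds, skipping filtered POS
    # tags and single punctuation tokens, mirroring the loop in the example.
    counts = {} if counts is None else counts
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower(), flags=re.UNICODE)
    toks = nltk.word_tokenize(cleaned)
    tags = [tag for _, tag in nltk.pos_tag(toks)]
    for i in range(len(toks) - 1):
        w0, w1 = toks[i], toks[i + 1]
        if tags[i] in FILTERED_TAGS or (len(w0) == 1 and w0 in string.punctuation):
            continue
        counts[w0] = counts.get(w0, 0) + 1
        if tags[i + 1] in FILTERED_TAGS or (len(w1) == 1 and w1 in string.punctuation):
            continue
        counts[f'{w0} {w1}'] = counts.get(f'{w0} {w1}', 0) + 1
    return counts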
Example #2
	def crawler(self):
		logging.info("Crawler activated with query filter %s" %self.target)
		# if self.sources.nb == 0:
		# 	sys.exit("Error: no sources found in the project.")
		try:
			self.project.load_sources()
			self.project.load_queue()
			self.project.load_logs()
		except AttributeError:
			self.load_project()

		#logging.info("Begin crawl with %i active urls"%self.sources.active_nb)
		self.push_to_queue()
		logging.info("Processing %i urls"%self.queue.count())
		if self.queue.count() == 0:
			self.update_status(self.task["name"], "running", False, "no pending url")
			self.config_crawl()
			return sys.exit("No pending url to process")

		#print self.queue.list

		while self.queue.count() > 0:
			try:
				for item in self.queue.find().sort([("depth", 1)]):
					self.update_status(self.task["name"], "running")
					if item["url"] in self.results.distinct("url"):
						logging.info("in results")
						self.queue.remove(item)

					elif item["url"] in self.logs.distinct("url"):
						logging.info("in logs")
						self.queue.remove(item)
					else:
						#print "Treating", item["url"], item["depth"]
						try:
							p = Page(item["url"], item["source_url"],item["depth"], item["date"], True)
						except KeyError:
							p = Page(item["url"], item["source_url"],item["depth"], self.date, True)
						if p.download():
							a = Article(p.url,p.html, p.source_url, p.depth,p.date, True)
							if a.extract():
								#Targeted crawl filtering for relevance
								if self.target:
									if a.filter(self.query, self.directory):
										if a.check_depth(a.depth):
											a.fetch_links()
											if len(a.links) > 0:
												for url, domain in zip(a.links, a.domains):
													if url not in self.queue.distinct("url") and url not in self.results.distinct("url"):
														self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
														if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
													try:
														self.results.insert(a.export())
													except pymongo.errors.DuplicateKeyError:
														#self.results.update(a.export())
														pass

										else:
											logging.debug("depth exceeded")
											self.logs.insert(a.log())
									else:
										logging.debug("Not relevant")
										self.logs.insert(a.log())
								else:
									if a.check_depth(a.depth):
										a.fetch_links()
										if len(a.links) > 0:
											for url, domain in zip(a.links, a.domains):
												try:
													self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
													if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
												except pymongo.errors.DuplicateKeyError:
													pass
												try:
													self.results.insert(a.export())
												except pymongo.errors.DuplicateKeyError:
													pass
									else:
										logging.debug("Depth exceeded")
										try:
											self.logs.insert(a.log())
										except pymongo.errors.DuplicateKeyError:
											self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})

							else:
								logging.debug("Error Extracting")
								try:
									self.logs.insert(a.log())
								except pymongo.errors.DuplicateKeyError:
									self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})
						else:
							logging.debug("Error Downloading")
							self.logs.insert(p.log())

						self.queue.remove(item)
						logging.info("Processing %i urls"%self.queue.count())
					if self.queue.nb == 0:
						break
				if self.queue.nb == 0:
					break
				if self.results.count() > 200000:
					self.queue.drop()
					break
			except Exception as e:
				self.update_status(self.task["name"], "running", False, str(e))
		return sys.exit(1)
Example #3
from corona_api import Corona
from events import Events
from mood import Mood
from article import Article

crawler = Corona()

japan_data = crawler.get_confirmed_by_country("Japan")[-1]
world_data = crawler.get_confirmed_globally()

print(japan_data)
print(world_data)

events = Events()

events.event_process(japan_data, world_data)

processed_events = events.__dict__

mood = Mood()

mood.decision(processed_events)

article = Article(processed_events, mood.__dict__)

today_covid_article = article.generate()

print(today_covid_article)
Example #4
crawler = Crawler()

# Collect data
weather_data = crawler.weather_fetch()
yesterday_data = crawler.yesterday_fetch()
print(weather_data)
print(yesterday_data)

# Process weather events
weather_events = Events(weather_data)
weather_events.process_events()
weather_info = weather_events.temp_max

# Modeling
now_model = Model(weather_info)
yes_model = Model(yesterday_data)
today_visitor = now_model.modeling()
yesterday_visitor = yes_model.modeling()

# Mood decision
mood = Mood()
template = mood.decision(today_visitor, yesterday_visitor)
print(template)

# Generate the article
article = Article(template, weather_events, today_visitor, yesterday_data, yesterday_visitor)
generated_article = article.generate()
print(generated_article)
with open("better_than_yesterday.txt", "w", encoding="UTF-8") as f:
    f.write(generated_article)
Example #5
'''
Created on Apr 9, 2016

@author: zhongzhu
'''

import traceback

from article import Article
from simplify import simplify_sen


with open("../data/set1/a2.txt") as f:
    article = Article(f.read())
    for s in article.sentences():
        try:
            if s:
                print("ORIGINAL:\n" + s)
                for sen in simplify_sen(s):
                    print("DERIVED:\n" + sen)
        except Exception as e:
            print("[Error]" + str(e))
            traceback.print_exc()
        print("")
Example #6
File: olm.py Project: DaveKP/olm
def generateSite():
    # Source markdown files
    articles = []
    draft_articles = []
    unlisted_articles = []
    pages = []
    subsites = set()
    logging.info("Scanning source files")
    time_source_start = time.time()
    for dirname, dirs, files in os.walk(CONTEXT.SOURCE_FOLDER):
        for filename in files:
            filepath = os.path.join(dirname, filename)
            relpath = os.path.relpath(filepath, CONTEXT.SOURCE_FOLDER)
            firstfolder = relpath.split(os.sep)[0]
            basename, extension = os.path.splitext(filename)
            if extension.lower() == ".md":
                if firstfolder[0] == "_":
                    subsites.add(firstfolder)
                elif firstfolder == "pages":
                    logging.debug("Found %s", filepath)
                    pages.append(Page(CONTEXT, filepath))
                else:
                    logging.debug("Found %s", filepath)
                    article = Article(CONTEXT, filepath)
                    if article.type in CONTEXT.ARTICLE_TYPES + CONTEXT.INDEX_TYPES:
                        if article.status == ArticleStatus.ACTIVE:
                            articles.append(article)
                        elif article.status == ArticleStatus.UNLISTED:
                            unlisted_articles.append(article)
                        else:
                            draft_articles.append(article)
    logging.info(
        "Processed %d articles, %d unlisted articles, %d drafts, and %d pages in %f seconds",
        len(articles), len(unlisted_articles), len(draft_articles), len(pages),
        time.time() - time_source_start)

    signal_sender = signal(Signals.AFTER_ALL_ARTICLES_READ)
    signal_sender.send((CONTEXT, articles))

    signal_sender = signal(Signals.BEFORE_WRITING)
    signal_sender.send((CONTEXT, Writer))

    logging.info("Writing %d articles", len(articles))
    time_write_start = time.time()
    for index, article in enumerate(articles):
        logging.debug("Writing file %d of %d", index + 1, len(articles))
        article.write_file()
    logging.info("Wrote %d articles in %f seconds", len(articles),
                 (time.time() - time_write_start))

    logging.info("Writing %d pages", len(pages))
    time_write_start = time.time()
    for index, page in enumerate(pages):
        logging.debug("Writing file %d of %d", index + 1, len(pages))
        page.write_file()
    logging.info("Wrote %d pages in %f seconds", len(pages),
                 (time.time() - time_write_start))

    # Index
    logging.info("Writing articles index")
    CONTEXT.ARTICLES = articles
    index = Index(CONTEXT)
    index.write_file()

    # Static files
    logging.info("Compiling static files")
    time_static_start = time.time()
    sass.compile(dirname=(os.path.join(CONTEXT.STATIC_FOLDER, 'css'),
                          os.path.join(CONTEXT.OUTPUT_FOLDER, 'theme', 'css')),
                 output_style='compressed')
    for dirname, dirs, files in os.walk(
            os.path.join(CONTEXT.STATIC_FOLDER, 'js')):
        for filename in files:
            filepath = os.path.join(dirname, filename)
            basename, extension = os.path.splitext(filename)
            rel_path = os.path.relpath(
                filepath, os.path.join(CONTEXT.STATIC_FOLDER, 'js'))
            if extension.lower() == ".js":
                with codecs.open(filepath, encoding='utf-8',
                                 errors='ignore') as js_file:
                    minified = js_file.read()  #jsmin(js_file.read())
                output_filepath = os.path.join(CONTEXT.OUTPUT_FOLDER, 'theme',
                                               'js', rel_path)
                os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
                with codecs.open(output_filepath, 'w+',
                                 encoding='utf-8') as js_min_file:
                    js_min_file.write(minified)
    logging.info("Processed static files in %f seconds",
                 time.time() - time_static_start)
    return subsites
Example #7
import util.questionListGenerator as questionListGenerator
import traceback, sys, re

if __name__ == "__main__":

    if len(sys.argv) != 3:
        sys.exit("""
            Usage: ./ask.py article_file.htm N
            article_file    HTML file containing the article HTML content
            N               Number of questions to output.
        """)

    article_filename = sys.argv[1]
    num_questions = int(sys.argv[2])

    article = Article(article_filename, 'processed')
    sentences = article.get_sentence_list(False, False, False)

    # Fetch sentence candidates that can be converted into questions.
    selected_sentences = selector.process(sentences)
    # for sent in selected_sentences:
    # print sent

    # Use POS Tagging and Transformation rules to generate questions
    questions = questionListGenerator.process(selected_sentences,
                                              num_questions)

    # Select tops and print questions
    questions = questions[:num_questions]
    for question in questions:
        print(question)
Example #8
def get_json_from_articles(urls):
    '''
    Use the HTML content previously extracted from each URL to create an
    Article object and serialise it to JSON. The list of JSON objects is returned.
    '''

    jsons = []

    for url in urls:
        url = urljoin("https://www.20min.ch/", url)

        page_content = get_content_from_url(url)

        if page_content:
            page_soup = BeautifulSoup(page_content, 'html.parser')
            story = page_soup.find('div', attrs={'class': 'story'})
            title = story.find('div', attrs={
                'class': 'story_titles'
            }).find('h1').text

            try:
                clearfix = story.find('div',
                                      attrs={
                                          'class': 'published clearfix'
                                      }).find('h4').text
            except Exception:
                clearfix = '/'
            try:
                dateTime = story.find('div',
                                      attrs={
                                          'class': 'published clearfix'
                                      }).find('p').text.split(';')[0]
            except Exception:
                dateTime = '/'
            try:
                dateAkt = story.find(
                    'div', attrs={
                        'class': 'published clearfix'
                    }).find('p').find('span').text.strip("Akt: ")
            except Exception:
                dateAkt = '/'
            try:
                h3 = story.find('div', attrs={
                    'class': 'story_titles'
                }).find('h3').text
            except Exception:
                h3 = '/'

            photos_videos = []
            try:
                ph = story.find('div', attrs={
                    'class': 'story_media'
                }).find('div', attrs={
                    'class': 'ginfo'
                }).find_all('a')
                for p in ph:
                    photos_videos.append({
                        'description': p.text,
                        'url': p['href']
                    })
            except Exception:
                # stderr.write("Error extracting photos/videos")
                pass

            try:
                video = story.find('div', attrs={
                    'class': 'story_media'
                }).find('iframe')['src']
                caption = story.find('div', attrs={
                    'class': 'story_media'
                }).find('div', attrs={
                    'class': 'caption'
                }).text
                photos_videos.append({'description': caption, 'url': video})
            except Exception:
                # stderr.write("Error extracting photos/videos")
                pass

            story_text = []
            autor = ''

            s_text = story.find('div', attrs={
                'class': 'story_text'
            }).find_all('p')
            for s in s_text:
                try:
                    if s.attrs.get('class')[0] == 'autor':
                        autor = s.text
                except Exception:
                    story_text.append(s.text)

            article = Article(url, title, story_text, autor, clearfix,
                              dateTime, dateAkt, photos_videos)

            jsons.append(serialise_to_json(article))
            if url == urljoin("https://www.20min.ch/", urls[0]):
                with open("json_file.json", "w") as f:
                    json.dump(article, f, default=serialise_to_json, indent=4)
        else:
            stderr.write("Failed to retrieve link : " + url + "\n")

    return jsons
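Two helpers called above, get_content_from_url and serialise_to_json, are not shown in this snippet; the versions below are minimal stand-ins consistent with how they are used, not the project's actual implementations:

import requests

def get_content_from_url(url, timeout=10):
    # Hypothetical helper: fetch raw HTML, returning None on any request failure.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def serialise_to_json(obj):
    # Hypothetical helper: expose an object's attributes so it can be passed
    # directly or as the default= hook of json.dump().
    return obj.__dict__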
Example #9
 def __next__(self):
     row = next(self.reader)
     if not row:
         raise StopIteration
     category, text = row
     return Article(text, category=category)
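Only __next__ is shown above; the sketch below fills in the kind of CSV-backed reader it implies. The class name, the file layout, and the Article(text, category=...) signature are assumptions:

import csv

class Article:
    # Hypothetical stand-in for the project's Article class.
    def __init__(self, text, category=None):
        self.text = text
        self.category = category

class CsvArticleReader:
    # Iterate over (category, text) rows of a CSV file, yielding Article objects.
    def __init__(self, path):
        self._file = open(path, newline='', encoding='utf-8')
        self.reader = csv.reader(self._file)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            row = next(self.reader)
        except StopIteration:
            self._file.close()
            raise
        if not row:
            raise StopIteration
        category, text = row
        return Article(text, category=category)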
Example #10
    db = db_articles

    def get_cursor():
        if a.doc is not None:
            docs = db.find({'_id': ObjectId(a.doc)})
        elif a.person is not None:
            docs = db.find({'analysis.final.candidates.id': a.person})
        elif a.constituency is not None:
            docs = db.find({'constituencies.id': a.constituency})
        else:
            docs = db.find() \
                     .sort([('time_added', -1)])

        return docs

    page = 0

    docs = get_cursor().skip(100 * page).limit(100)

    while docs:
        print "PAGE", page

        for doc in docs:
            article = Article(doc)
            article.process()
            article.save()

        page += 1
        docs = get_cursor().skip(100 * page).limit(100)
Example #11
def main():
    args = parse_args()
    root_name = args.xml_file.rsplit('.', 1)[0]
    spacy_model = 'en_core_sci_md'

    if not os.path.exists('cache'):
        os.mkdir('cache')

    articles = None
    pickle_file = 'cache/articles_{}.pkl'.format(root_name)
    if os.path.exists(pickle_file) and not args.force_read:
        print('loading from {}'.format(pickle_file))
        articles = pickle.load(open(pickle_file, 'rb'))
    else:
        print('reading from {}'.format(args.xml_file))
        records = read_xml(args.xml_file)
        print('found {} articles'.format(len(records)))
        records = filter_valid_records(records)
        nlp = spacy.load(spacy_model)
        articles = [Article(record, nlp) for record in tqdm(records)]
        pickle.dump(articles, open(pickle_file, 'wb'))

    print('using {} valid articles'.format(len(articles)))

    model = None
    model_file = 'cache/model_{}'.format(root_name)
    force_train = False
    corpus, dictionary = transform_corpus(articles)
    if os.path.exists(model_file) and not args.force_train:
        print('loading trained model from {}'.format(model_file))
        model = LdaModel.load(model_file)
    else:
        print('training LDA')
        model = train_lda(corpus, dictionary, num_topics=args.num_topics)
        model.save(model_file)

    print('Finished training/loading')
    cm = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print('Topic coherence:', coherence)

    topics = model.show_topics(num_words=5, formatted=False)
    distributions = topic_distributions(corpus, model)
    distributions = to_ndarray(distributions, len(topics))
    topic_assignments = np.argmax(distributions, axis=1)
    histogram = hist(topic_assignments, len(topics))
    central_indices = find_central_articles(distributions, topic_assignments,
                                            len(topics))
    print('Num of topics: {}'.format(len(topics)))

    # below is for output
    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    output_files = []
    for i in range(len(topics)):
        f_name = os.path.join(args.out_dir, 'group{}.txt'.format(i))
        output_files.append(open(f_name, 'w'))

    for i, topic in enumerate(topic_assignments):
        output_files[topic].write('{}\n'.format('-' * 50))
        output_files[topic].write('{}\n'.format(str(articles[i])))
        output_files[topic].write('{}\n\n'.format('=' * 50))

    for f in output_files:
        f.close()

    grouping_file = os.path.join(args.out_dir, 'grouping.txt')
    with open(grouping_file, 'w') as f:
        for i in range(len(topics)):
            f.write('Topic {}: {} articles\n'.format(i, histogram[i]))
            f.write('word: {}\n'.format(str([t[0] for t in topics[i][1]])))
            f.write('Representative article:\n')
            a = articles[central_indices[i]]
            f.write('Title: {}\n'.format(a.title))
            f.write('Authors: {}\n'.format(', '.join(a.authors)))
            f.write('DOI: {}\n'.format(a.doi))
            f.write('\n')
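transform_corpus and train_lda are called above but not shown; a plausible minimal version built on gensim is sketched below, assuming each Article exposes a list of preprocessed tokens (the tokens attribute is an assumption):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

def transform_corpus(articles):
    # Build a bag-of-words corpus and its dictionary from tokenised articles.
    texts = [article.tokens for article in articles]
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary

def train_lda(corpus, dictionary, num_topics=10):
    # Train a plain LDA model; passes and random_state are illustrative defaults.
    return LdaModel(corpus=corpus, id2word=dictionary,
                    num_topics=num_topics, passes=5, random_state=1)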
Example #12
def getHaikuLines(loadedObjects):
    """
    Requires: loadedObjects contains a dictionary with a nltk dictionary,
    a pyphen dictionary, and a spaCy NLP object all properly named in
    accordance with the Article class.
    Effects: Returns a dictionary containing two lists containing news headlines
    of lengths 5 and 7 syllables long.
    """
    newsApi = NewsApiClient(api_key=newsAPIKey)
    # A list of all news sources we get our headlines from, subject to change
    newsSources = [
        'abc-news', 'al-jazeera-english', 'associated-press', 'bbc-news',
        'bleacher-report', 'bloomberg', 'business-insider', 'cbs-news', 'cnbc',
        'cnn', 'espn', 'financial-times', 'fox-news', 'msnbc',
        'national-geographic', 'nbc-news', 'politico', 'reuters',
        'the-economist', 'the-new-york-times', 'the-huffington-post',
        'the-washington-post', 'the-wall-street-journal', 'time', 'usa-today',
        'wired'
    ]
    # The lists that will hold all 5 and 7 syllable news headlines
    fiveSyllableLines = []
    sevenSyllableLines = []
    # Iterate through each news source in the list to get all headlines
    for newsSource in newsSources:
        # Getting the top headlines for a particular news source
        topHeadlines = newsApi.get_top_headlines(sources=newsSource,
                                                 page_size=100)
        # Go through all the article dictionaries in the list of top headlines
        # and extract information about each article.
        for articleInfo in topHeadlines['articles']:
            # Instantiation of an article object
            article = Article(articleInfo['title'], loadedObjects)
            # Gets the best 5 and 7 syllable lines respectively
            fiveSyllableLine = article.getBest(5)
            sevenSyllableLine = article.getBest(7)
            # If there is a 'best' five syllable line, add it to the list of
            # headlines, same for the seven syllable line.
            if fiveSyllableLine:
                # Makes sure that the first letter of each word is capitalized
                fiveSyllableLine = [word.title() for word in fiveSyllableLine]
                # Add a dictionary to the list containing info about the line
                fiveSyllableLines.append({
                    'title':
                    fiveSyllableLine,
                    'url':
                    articleInfo['url'],
                    'source':
                    articleInfo['source']['name'],
                    'entCount':
                    article.totalEntities(fiveSyllableLine)
                })
            if sevenSyllableLine:
                # Makes sure that the first letter of each word is capitalized
                sevenSyllableLine = [
                    word.title() for word in sevenSyllableLine
                ]
                # Add a dictionary to the list containing info about the line
                sevenSyllableLines.append({
                    'title':
                    sevenSyllableLine,
                    'url':
                    articleInfo['url'],
                    'source':
                    articleInfo['source']['name'],
                    'entCount':
                    article.totalEntities(sevenSyllableLine)
                })
    return {
        'fiveSyllableLines': fiveSyllableLines,
        'sevenSyllableLines': sevenSyllableLines
    }
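The getBest(5) and getBest(7) calls above rely on syllable counts from the pyphen dictionary mentioned in the docstring; the helper below is a rough sketch of that counting step, and its name is hypothetical:

import pyphen

_dic = pyphen.Pyphen(lang='en_US')

def count_syllables(line):
    # Approximate the syllable count of a headline via pyphen hyphenation;
    # tokens without letters (numbers, punctuation) are ignored.
    words = [w for w in line.split() if any(c.isalpha() for c in w)]
    return sum(len(_dic.inserted(w.lower()).split('-')) for w in words)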
Example #13
def load_one_article(volume, filename):
    with open(filename, 'r', encoding='utf-8') as f:
        s = f.read()
        # print(s)

        # volume
        volume = volume
        # print('volume: {}'.format(volume))

        # number
        pos_number_start = re.search('Lesson', s).span()[0] + 7
        pos_number_end = pos_number_start + re.search(
            ':', s[pos_number_start:]).span()[0]
        number_str = s[pos_number_start:pos_number_end]
        number = int(number_str)
        # print('number: {}'.format(number))

        # index (volume_number)
        index = str(volume) + '_' + str(number)
        # print('index: {}'.format(index))

        # title
        pos_title_start = re.search(':', s).span()[0] + 1
        pos_title_end = pos_title_start + re.search(
            r'\n', s[pos_title_start:]).span()[0]
        title = s[pos_title_start:pos_title_end]
        pos_title_end_res = re.search(re.compile(r'[\u4e00-\u9fff]'), title)
        if pos_title_end_res is not None:
            pos_title_len = pos_title_end_res.span()[0]
            title = title[:pos_title_len]
        title = title.strip()
        # title = title.lower()
        # print('title: {}'.format(title))

        # text (main body)
        # two kinds of delimiters:
        #   1) 课文内容 ("lesson text"): <text>  Notes on the text
        #   2) 听录音,然后回答以下*问题* ("listen to the recording, then answer the *questions*") <text>  New words and expressions
        pos_text_res = re.search('课文内容', s)
        pos_text_start = 0
        pos_text_end = -1
        if pos_text_res is not None:
            pos_text_start = pos_text_res.span()[0]
            pos_text_start = pos_text_start + re.search(
                r'\n', s[pos_text_start:]).span()[0]
            pos_text_end_res = re.search(r'Notes on the text', s)
            pos_text_end = -1
            if pos_text_end_res is None:
                pos_text_end_res = re.search(r'课文注译', s)
                if pos_text_end_res is None:
                    pos_text_end_res = re.search(r'课文注释', s)
                    if pos_text_end_res is None:
                        pos_text_end = re.search(r'参考译文', s).span()[0]
                    else:
                        pos_text_end = pos_text_end_res.span()[0]
                else:
                    pos_text_end = pos_text_end_res.span()[0]
            else:
                pos_text_end = pos_text_end_res.span()[0]
        else:
            pos_text_res = re.search('问题', s)
            pos_text_start = pos_text_res.span()[0]
            pos_text_end = re.search(r'New words and expression', s).span()[0]

        text = s[pos_text_start:pos_text_end]
        text = re.sub(re.compile(r'[\u4e00-\u9fff]'), '', text)
        text = re.sub(r"[。]+", "", text)
        text = re.sub(r"[\n\n]+", "\n", text)
        text = re.sub(r"(&lsquo;)", "", text)
        text = re.sub(r"(&rsquo;)", "", text)
        text = re.sub(r"( )+", " ", text)
        text = text.strip()
        # print(text)

        # generate one article object
        one_article = Article(volume=volume,
                              number=number,
                              index=index,
                              title=title,
                              text=text,
                              keywords='',
                              text_zh='')

        return one_article
Example #14
import os.path, sys

sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
from database import Database
from article import Article
from helper import Helper
from hmm import HiddenMarkovModel
import csv
import nltk
import dill

if __name__ == '__main__':
    module_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    syllables_dictionary = Helper.load_syllables_dictionary()
    article = Article('', 0, syllables_dictionary)

    #load model
    hmm_save_path = module_path + '/hmm/hmm_data/unsupervised_hmm_model.dill'
    hmm_tagger = HiddenMarkovModel()
    hmm_tagger.load_model(hmm_save_path)

    vlsp_folder_path = module_path + '/VLSP_Sentences/test'
    files = os.listdir(vlsp_folder_path)
    test_sentences = []
    for file in files:
        posts = Helper.read_vlsp_sentences(vlsp_folder_path + '/' + file)
        for post in posts:
            article = Article(post, 0, syllables_dictionary)
            article.detect_paragraph()
            for paragraph in article.paragraphs:
Example #15
            json.dump(corpus.articles, json_file, sort_keys=True, indent=4)

    print([title for title in corpus.articles.keys()])

    semi_cleaned_articles = {}
    for title, article in corpus.articles.items():
        semi_cleaned_articles[title] = {
            'pageid': article['pageid'],
            'text': corpus.strip_mediawiki_markup(article['text'])
        }

    # limit = 0
    article_sentences = {}
    all_articles = []
    for title, article in semi_cleaned_articles.items():
        all_articles.append(Article(title, article['pageid'], corpus.get_sentences_and_citations(article['text'])))
        # limit += 1
        # if limit == 3:
        #     break

    train, dev, test = corpus.get_corpus_splits(all_articles)

    # print(get_corpus_stats(all_articles))
    # print(get_corpus_stats(train))
    # print(get_corpus_stats(dev))
    # print(get_corpus_stats(test))

    save_as_json(all_articles, 'all_articles.json')
    save_as_json(train, 'train.json')
    save_as_json(dev, 'dev.json')
    save_as_json(test, 'test.json')
Example #16
import re

import requests
from bs4 import BeautifulSoup

blog_url = "https://www.joyhwong.com"
pat_category_path = "/archives/category/pat/"
request_url = blog_url + pat_category_path
article_list = []

while True:
    r = requests.get(request_url)
    soup = BeautifulSoup(r.text, "html.parser")

    headers = soup.find_all('header', class_='entry-header')

    for header in headers:
        a = header.find("a")
        if re.findall('乙级', a.text) or (re.findall('甲级', a.text) and re.findall(r'C\+\+版', a.text)):
            continue
        try:
            article_list.append(Article(a['href'], a.text, re.findall('[0-9]{4}', a.text)[0]))
        except IndexError:
            pass

    if soup.find('a', class_='next page-numbers') is None:
        break
    else:
        request_url = soup.find('a', class_='next page-numbers')['href']

for article in sorted(article_list):
    print(article.id, article.title, article.url)
    print()
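sorted(article_list) at the end requires the scraped Article objects to be orderable; the stand-in below is consistent with the attributes used above, with ordering by the four-digit id being an assumption:

from functools import total_ordering

@total_ordering
class Article:
    # Hypothetical stand-in matching Article(url, title, id) as used above.
    def __init__(self, url, title, id):
        self.url = url
        self.title = title
        self.id = id

    def __eq__(self, other):
        return self.id == other.id

    def __lt__(self, other):
        return self.id < other.id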
Example #17
from article import Article
from http_ds import HttpDataSource
from sql_ds import SQLDataSource
from mongo_ds import DocDataSource
from person import Person

someArticle = Article("is very liked by juan, talks about WAR")
gameArticle = Article("Quake winner is fatality")

juan = Person("Juan")
ivan = Person("Ivan")
jess = Person("Jess")

http1 = HttpDataSource()
http2 = HttpDataSource()

db = SQLDataSource()
mongodb = DocDataSource()

juan.likeArticle(someArticle, http1)
juan.likeArticle(someArticle, http2)
juan.likeArticle(someArticle, db)
juan.likeArticle(someArticle, mongodb)

ivan.likeArticle(gameArticle, http1)

jess.likeArticle(gameArticle, mongodb)
Example #18
    async def on_message(self, message):

        if message.content.startswith('!'):
            return

        if message.channel.category.name == shopping_category:

            if message.author != self.bot.user:

                guild = message.guild

                for line in message.content.split("\n"):

                    quantity = extractInt(line)

                    if quantity == 0:
                        quantity = 1

                    article = Article(removeInt(line), quantity,
                                      message.author.name)
                    groceries_list = groceries_lists.get_groceries_list_by_id(
                        guild.id)

                    if groceries_list is not None:

                        grocery = groceries_list.get_by_id(message.channel.id)

                        if grocery is not None:

                            similar_article = grocery.get_similare(
                                article.similar_article)

                            if similar_article is not None:

                                old_message = await message.channel.fetch_message(
                                    similar_article.message_id)
                                similar_article.add_quantity(article.quantity)
                                await old_message.edit(
                                    embed=similar_article.to_embed())

                                for i in add_number(
                                        similar_article.quantity -
                                        article.quantity,
                                        similar_article.quantity):
                                    await old_message.add_reaction(i)

                                await old_message.clear_reaction("➕")
                                await old_message.add_reaction("➕")

                            else:
                                new_message = await message.channel.send(
                                    embed=article.to_embed())
                                article.message_id = new_message.id
                                grocery.add(article)
                        else:
                            groceries_list.add(
                                Grocery(message.channel.id, [article]))
                            new_message = await message.channel.send(
                                embed=article.to_embed())
                            article.message_id = new_message.id
                    else:
                        groceries_lists.add(
                            GroceriesList(
                                guild, Grocery(message.channel.id, [article])))
                        new_message = await message.channel.send(
                            embed=article.to_embed())
                        article.message_id = new_message.id
                    await message.delete()
            else:
                await message.add_reaction("✅")
                embed_quantity = int(message.embeds[0].fields[0].value)
                for i in add_number(1, embed_quantity):
                    await message.add_reaction(i)
                await message.add_reaction("➕")
Example #19
            if index_file < 10:
                file_name = '0' + str(index_file)
            else:
                file_name = str(index_file)
            print('Start handling data in file %s in folder %s' % (file_name, folder))

            wiki_data_path = '/viwiki_data/' + folder + '/wiki_' + file_name
            wiki_data_path = module_path + wiki_data_path
            doc_array = Helper.load_wiki_data(wiki_data_path)

            position_file = '%s_%s' % (folder, file_name)

            for index, doc in enumerate(doc_array):
                print('Start training in doc %i of file %s' % (index, folder))
                number_doc += 1
                article = Article(doc, position_file, hmm_dictionary)
                article.detect_paragraph()
                for paragraph in article.paragraphs:
                    for sentence in paragraph:
                        number_sentence += 1
                        sentence = sentence.lower()
                        array_syllables = sentence.split()

                        sentence_object = Article(sentence, position_file, hmm_dictionary)
                        article_unlabeled_sequences = sentence_object.convert_syllable_to_number()
                        state_sequence = hmm.veterbi_algorithm(
                            article_unlabeled_sequences[0],
                            using_sub_params=True,
                            bigram_hash=bigram_hash,
                            invert_bigram_hash=invert_bigram_hash,
                            number_occurrences=statistic_bigram,
Example #20
 def add_headline(self, headline):
     url = headline.attr('href')
     title = headline.text()
     article = Article(url, title, self.name)
     if url not in [a.url for a in self.articles]:
         self.articles.append(article)
Example #21
 def execute_with_params(self, file_pattern="", path="", magazines=None,articles=None):
     listOfMagazines=[]
     container = dict()
     for folder, subs, files in os.walk(path):
         with open(os.path.join(folder, file_pattern), 'w') as dest:
             docValue=""
             prevId=""
             identifier=""
             page_count=0
             for filename in files:
                 #print("Filename "+filename)
                 if filename == file_pattern:
                     pass                    
                 elif filename.endswith(file_pattern):
                     document=""
                     if not folder in path:
                         document = folder+"/"+filename
                     else:
                         document = path+filename
                     doc = self.read(document)
                     if magazines==None:
                         pass
                         #container = self.saveValuesForNewIdentifier(container, identifier, doc)
                     elif len(doc)<1:
                         pass
                     else:
                         fileId = self.getMagazine(filename)
                         mag=magazines[fileId]
                         if fileId not in listOfMagazines:
                             listOfMagazines.append(fileId)
                         page = self.extractPageNumberFromFilename(filename)
                         if page > 0:
                             article = mag.find_article_by_page(page)
                             logger.debug(article)
                             logger.debug(filename)
                             
                             #get filename
                             name=""
                             str_name = filename.split("page")
                             try:
                                 name = str_name[0]
                             except:
                                 name = filename
                             mag.set_name(name)
                             if article != None: #If article exists
                                 if len(doc)<5:
                                     logger.info("SMALL "+str(len(doc)) + " = "+doc)
                                     pass
                                 if len(doc)>0:
                                     self.setLengths(len(doc))
                                     #split value in case too long to process
                                     if len(doc)>3000:
                                         sentences=""
                                         splitted = doc.split(' ')
                                         for split in splitted:
                                             lenn = len(sentences) + len(split) + 1 #+1 for the space
                                             if lenn > 3000:
                                                 article.addText(sentences, page)
                                                 sentences = ""
                                             sentences += split+" "
                                         if len(sentences) > 0:
                                             article.addText(sentences, page)
                                     else:
                                         article.addText(doc, page)
                                         article.set_len(len(doc))
                             else:
                                 #in case we cannot find article
                                 article = Article(filename, page, "")
                                 article.addText(doc, page)
                                 article.set_len(len(doc))
                                 self.setLengths(len(doc))
                                 mag.add_article(article)    
                         else:
                             #if article not found for the document, store pages page by page
                             print("stored file "+filename+" as article was not found")
                             article = Article(filename, page, "")
                             article.addText(doc, page)
                             article.set_len(len(doc))
                             mag.add_article(article) 
                             self.setLengths(len(doc))
                             
                         magazines[fileId] = mag 
                         #mag.log_articles_and_contents()
                         
                 elif filename.endswith(".xml"):
                     page_count = 1+page_count
                     article = Article(filename, page, "")
                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc)>0:
                         xml = xmlParser(input_file=doc)
                         if bool(BeautifulSoup(doc, "html.parser").find()):
                             self.setLengths(len(doc))
                             article.set_len(len(doc))
                             html = htmlParser(doc)
                             article = self.split_document(html.get_text(), article, page_count)
                             result.append(article)
                     else:
                         #process
                         pass
                 elif filename.endswith(".html"):
                     page_count = 1+page_count
                     article = Article(filename, page, "")
                      
                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc)>0:
                         self.setLengths(len(doc))
                         article.set_len(len(doc))
                         html = htmlParser(doc)
                         article = self.split_document(html.get_text(), article, page_count)
                         result.append(article)
                     else:
                         #process
                         pass
             dest.close()              
                  
     result = []
     logger.debug("VALUES FOR magazines "+str(len(listOfMagazines)))
     for id in listOfMagazines:
         if magazines[id] not in result:
             result.append(magazines[id])
     return result, listOfMagazines
Example #22
def build_feature_table(name,
                        kind='feats',
                        meta_path=META_PATH,
                        corpus_path=CORPUS_PATH,
                        taglist=None):
    print("Begin")
    # Open the meta file.
    with open(meta_path) as meta_file:
        reader = csv.reader(meta_file, delimiter=';')
        meta = [line for line in reader]
    print("Read in metadata")

    articles = []

    # Create article objects based on the metadata.
    print("Begin creating articles")
    for item in meta:
        article = Article(item[0], corpus_path + str(item[0]), item[5],
                          item[2], item[6])
        articles.append(article)
        print("Read in Article %s\t--\t%s" % (article.index, article.head))
    print("All articles created")

    # Tries to create the corpus, making a feature representation.
    print("Begin creating corpus")
    if kind == 'feats':
        corpus = CorpusFeats(articles)
    elif kind in ['wordlist', 'poslist'] and taglist:
        corpus = CorpusTagList(articles, taglist, kind)
    elif kind in ['bow', 'bop', 'bos', 'bag']:
        corpus = CorpusBags(articles, kind=kind)
    else:
        print("Do not understand what feature set you want.")
    print("Corpus created")

    # Gets the training and test tables. (it is worth noting that we do not use the test sets in this work)
    train = corpus.get_training_table().tolist()
    train_labels = corpus.get_training_labels()
    train_indices = corpus.get_training_indices()
    test = corpus.get_testing_table().tolist()
    test_labels = corpus.get_testing_labels()
    test_indices = corpus.get_testing_indices()
    feature_names = corpus.get_feature_names()

    print("Commence Training Table Creation")
    # Add feature names and labels to training feature table
    train_table = [feature_names] + train
    train_table[0] = ['index'] + train_table[0] + ['class']
    train_table[1:] = [[index] + row + [label] for index, row, label in zip(
        train_indices, train_table[1:], train_labels)]

    # Write the feature table to a file.
    with open(name + "_train.csv", "w") as train_file:
        writer = csv.writer(train_file)
        writer.writerows(train_table)
    print("Training Table Complete")

    if len(test) > 0:
        print("Commence Testing Table Creation")
        # Add feature names and labels to testing feature table
        test_table = [feature_names] + test
        test_table[0] = ['index'] + test_table[0] + ['class']
        test_table[1:] = [[index] + row + [label]
                          for index, row, label in zip(
                              test_indices, test_table[1:], test_labels)]
        with open(name + "_test.csv", "w") as test_file:
            writer = csv.writer(test_file)
            writer.writerows(test_table)
        print("Testing Table Complete")
    else:
        print("Nothing in test table so won't bother.")
    print("Finished")

    return corpus
Example #23
from flask import Flask, render_template, request
from article import Article

app = Flask(__name__)

posts = [
    Article('title', 'subtitle', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque sapien velit, aliquet eget commodo nec, auctor a sapien. Nam eu neque vulputate diam rhoncus faucibus. Curabitur quis varius libero. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam placerat sem at mauris suscipit porta. Cras metus velit, elementum sed pellentesque a, pharetra eu eros. Etiam facilisis placerat euismod. Nam faucibus neque arcu, quis accumsan leo tincidunt varius. In vel diam enim. Sed id ultrices ligula. Maecenas at urna arcu. Sed.', 'jatin katyal'),
    Article('title', 'subtitle', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque sapien velit, aliquet eget commodo nec, auctor a sapien. Nam eu neque vulputate diam rhoncus faucibus. Curabitur quis varius libero. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam placerat sem at mauris suscipit porta. Cras metus velit, elementum sed pellentesque a, pharetra eu eros. Etiam facilisis placerat euismod. Nam faucibus neque arcu, quis accumsan leo tincidunt varius. In vel diam enim. Sed id ultrices ligula. Maecenas at urna arcu. Sed.', 'jatin katyal'),
    Article('title', 'subtitle', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque sapien velit, aliquet eget commodo nec, auctor a sapien. Nam eu neque vulputate diam rhoncus faucibus. Curabitur quis varius libero. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam placerat sem at mauris suscipit porta. Cras metus velit, elementum sed pellentesque a, pharetra eu eros. Etiam facilisis placerat euismod. Nam faucibus neque arcu, quis accumsan leo tincidunt varius. In vel diam enim. Sed id ultrices ligula. Maecenas at urna arcu. Sed.', 'jatin katyal'),
    Article('title', 'subtitle', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque sapien velit, aliquet eget commodo nec, auctor a sapien. Nam eu neque vulputate diam rhoncus faucibus. Curabitur quis varius libero. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam placerat sem at mauris suscipit porta. Cras metus velit, elementum sed pellentesque a, pharetra eu eros. Etiam facilisis placerat euismod. Nam faucibus neque arcu, quis accumsan leo tincidunt varius. In vel diam enim. Sed id ultrices ligula. Maecenas at urna arcu. Sed.', 'jatin katyal'),
]

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/articles')
def articles():
    return render_template('articles.html', articles=posts)

@app.route('/article/<int:id>')
def article(id):
    try:
        post = posts[id-1]
        return render_template('article.html', article=post)
    except IndexError:
        return render_template('404.html')
    
@app.route('/new_article', methods=['GET', 'POST'])
def new_article():
    if request.method == "GET":
Example #24
 def __init__(self, full_url: str, article_id: int):
     self.full_url = full_url
     self.article_id = article_id
     self.article = Article(full_url, article_id)
Example #25
                        '-i',
                        dest='filename_in',
                        action='store',
                        default="metadata.yaml",
                        help='input YAML file')
    parser.add_argument('--output',
                        "-o",
                        dest='filename_out',
                        action='store',
                        default="article-metadata.tex",
                        help='output latex file')
    args = parser.parse_args()

    filename_in = args.filename_in
    filename_out = args.filename_out

    # print("Generating latex definitions ({1}) from {0}".format(filename_in, filename_out))

    with open(filename_in, "r") as file:
        article = Article(file.read())

    if len(article.authors) > 0:
        content = generate_latex_metadata(filename_in, article)
        if filename_out is not None:
            with open(filename_out, "w") as file:
                file.write(content)
        else:
            print(content)
    else:
        print("Error! No author found.")
Example #26
# coding: utf-8
import os.path, sys
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
from database import Database
from article import Article
from helper import Helper
from hmm_written_by_me import HiddenMarkovModel
import csv
import nltk
import dill

if __name__ == '__main__':
    module_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    syllables_dictionary = Helper.load_syllables_dictionary()
    article = Article('', 0, syllables_dictionary)

    hmm_by_me_dictionary_path = module_path + '/hmm/hmm_data/hmm_by_me_dictionary_without_cle_all_new_dict.pkl'
    hmm_dictionary = Helper.load_obj(hmm_by_me_dictionary_path)

    invert_dictionary_path = module_path + '/hmm/hmm_data/invert_hmm_by_me_dictionary_without_cle_all_new_dict.pkl'
    invert_hmm_dictionary = Helper.load_obj(invert_dictionary_path)

    hmm_save_path = module_path + '/hmm/hmm_data/unsupervised_hmm_by_me_without_cle_all_new_dict.pkl'
    hmm = Helper.load_obj(hmm_save_path)

    occurrences_data_path = module_path + '/hmm/hmm_data/occurrences.pkl'
    statistic_bigram = Helper.load_obj(occurrences_data_path)

    #test with sub parameter
    bigram_path = module_path + '/hmm/hmm_data/bigram.pkl'
Example #27
def create_article_map(directory_path,
                       output_csv_name,
                       k_hours=sim.measure_const.NUM_HOURS):
    """
    Main parsing function. Takes in a directory containing .nml files
    and returns a dictionary with keys that correspond to company symbols,
    and values that are sets of Article objects whose articles are about
    that company. 
    
    Arguments:
        directory_path: A string representing the filepath for directory containing .nml files to parse. 
        output_csv_name: Name of .csv file to output with articles and similarity
            information. 
        k_hours: Argument passed in to filter_old_articles(). Determines
            article filtering. Default is 72, so articles mapped to any
            given company which are at least 72 hours older than the
            current article being parsed will be filtered from the map.
    
    Returns:
        company_article_map: Map from companies to sets of Articles about
            those companies. The set for each company will contain articles that
            were published within k_hours hours of each other. 
    """
    company_article_map = {}
    curr_file_str = ""

    header_df = pd.DataFrame(columns=[
        "company", "headline", "time", "id", "old_score", "closest_neighbor",
        "is_reprint", "is_recombination", "num_unique_words"
    ])
    header_df = pd.DataFrame(columns=[
        "DATE_EST", "STORY_ID", "TICKER", "HEADLINE", "STORY_LENGTH",
        "CLOSEST_ID", "CLOSEST_SCORE", "TOTAL_OVERLAP", "IS_OLD", "IS_REPRINT",
        "IS_RECOMBINATION"
    ])
    header_df.to_csv(output_csv_name, index=False)

    f = open(output_csv_name, "a")
    csv_writer = csv.writer(f)

    directory = os.fsencode(directory_path)

    nml_files = []
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if ".nml" not in filename:
            continue
        else:
            nml_files.append(directory_path + filename)

    for filename in sorted(nml_files):
        with open(filename) as myfile:
            for next_line in myfile:
                curr_file_str += next_line
                if next_line == "</doc>\n":

                    xml_elem = ET.fromstring(curr_file_str)
                    company = xml_elem.find(".//djn-company-sig")
                    if company is None:
                        curr_file_str = ""
                        continue

                    md5_hash = xml_elem.attrib['md5']
                    timestamp = xml_elem.find(
                        ".//djn-mdata").attrib['display-date']
                    headline = xml_elem.find(".//headline").text.lstrip()
                    all_text = xml_elem.find(".//text")
                    article_text = "".join(all_text.itertext())
                    text_stemmed_filtered = sim.stem_and_filter(article_text)
                    num_unique_words = len(text_stemmed_filtered)

                    for c in company:
                        if c.attrib.get('about',
                                        False) != 'Y' or "." in c.text:
                            curr_file_str = ""
                            continue
                        company = c.text

                        new_article = Article(company, timestamp, headline,
                                              text_stemmed_filtered, md5_hash)
                        company_articles = filter_old_articles(
                            company_article_map, new_article, k_hours)

                        if len(company_articles) == 0:
                            company_articles.add(new_article)
                            company_article_map[company] = company_articles
                            curr_file_str = ""
                            continue
                        else:
                            old, closest_neighbor, closest_id = sim.compute_sim_measure(
                                new_article, company_articles)
                            new_row = [
                                new_article.timestamp, new_article.md5_id,
                                new_article.company, new_article.headline,
                                num_unique_words, closest_id, closest_neighbor,
                                old,
                                sim.is_old_news(old),
                                sim.is_reprint(old, closest_neighbor),
                                sim.is_recombination(old, closest_neighbor)
                            ]
                            csv_writer.writerow(new_row)

                            company_articles.add(new_article)
                            company_article_map[company] = company_articles

                        curr_file_str = ""

    f.close()
    return company_article_map
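filter_old_articles is called above but not shown; the sketch below follows the behaviour described in the docstring, assuming Article.timestamp has already been parsed into a datetime.datetime:

from datetime import timedelta

def filter_old_articles(company_article_map, new_article, k_hours=72):
    # Keep only the company's articles published within k_hours of the new
    # article; returns an empty set for a company not seen before.
    articles = company_article_map.get(new_article.company, set())
    cutoff = new_article.timestamp - timedelta(hours=k_hours)
    kept = {a for a in articles if a.timestamp >= cutoff}
    company_article_map[new_article.company] = kept
    return kept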
Example #28
import os
from article import Article

articles = {}
for entry in os.listdir('./accesses'):
    articles[entry[0:23]] = Article(entry[0:23])

articles_last_year = {}
for entry in articles:
    if (articles[entry].year >= 2014):
        articles_last_year[entry] = articles[entry]

accesses = {}
for entry in articles_last_year:
    accesses[entry] = articles_last_year[entry].accesses_in_time_interval(
        '2015-04-01', '2016-04-01')

my_articles = sorted(accesses, key=accesses.__getitem__, reverse=True)
for i in range(0, 10):
    print(my_articles[i] + '\t' + str(accesses[my_articles[i]]))
Example #29
def get_test_article():
    article = Article(0, "Test Article", test_article_url, "test article", "test", 162)
    return [article]
Example #30
def main():
    script_path = get_script_path()

    blog_path = script_path + "/../../../blog"

    argv = sys.argv[1:]
    try:
        opts, _ = getopt.getopt(argv, "p:h")  # short-option mode
    except getopt.GetoptError:
        help()
        exit(1)

    for opt, arg in opts:
        if opt in ['-p']:
            if not os.path.exists(arg):
                print("Error: bad blog path")
                exit(1)
            blog_path = arg

    article_path = script_path + "/../articles"
    files = os.listdir(article_path)

    for file in files:
        # read file
        absfile = article_path + "/" + file
        with open(absfile) as f:
            data = json.load(f)
            if not data:
                print("Error: bad article file: ", file)
                exit(1)

        articles = data['articles']
        common_info = {}
        for k, v in data.items():
            if k == 'articles':
                continue

            common_info[k] = v

        articles_hash = {}
        for article in articles:
            articles_hash[article['dst-dir']] = True

        # NOTE: clean dst path
        blog_post_path = blog_path + "/content/post"
        dst_path = blog_post_path + '/' + common_info.get("dst-path", None)
        if not os.path.exists(dst_path):
            os.makedirs(dst_path)
        else:
            dirs = os.listdir(dst_path)
            for dir in dirs:
                if not articles_hash.get(dir, None):
                    print("[+] removing ", dst_path)
                    shutil.rmtree(dst_path)

        # process common category
        categories = common_info.get('categories', None)
        if isinstance(categories, list):
            for category in categories:
                ct = Category(blog_path, category)
                ct.add()

        for article in articles:
            # process category
            categories = article.get('categories', None)
            if isinstance(categories, list):
                for category in categories:
                    ct = Category(blog_path, category)
                    ct.add()

            # process article
            atc = Article(script_path, blog_path, article, common_info)
            atc.convert()