def wrapper():
    '''Wrapper for main program.'''
    collect.main()

def download_and_process(self, project, ncfile):
    downloaded_file = os.path.join(self.download, ncfile)
    output_file = os.path.join(self.output, project, ncfile)
    do_download = not os.path.isfile(downloaded_file)
    main(self.output, self.download, do_download, [project.lower()], self.csv, filesubset=[ncfile.lower()])
    assert os.path.isfile(downloaded_file)
    assert os.path.isfile(output_file)
    return output_file
Example #3
    def run(self):
        import collect
        try:
            collect.main(self.code, self.page)
        except Exception:
            self.signal.emit("Error while collecting the forum post index.")
            return self.signal.emit("Analysis failed.")
        self.signal.emit("Forum post index collected. (1/5)")
        import contain
        try:
            os.mkdir('./tmp/posts')
        except FileExistsError:
            pass
        article_title = pd.read_csv('./tmp/_title.csv')
        for o in range(article_title.shape[0]):
            posts = contain.get_contain(self.code, article_title['url'][o], article_title['em'][o])
            pd.DataFrame(posts).to_csv('./tmp/posts/' + str(o) + '.csv', index=False)
            self.signal.emit("论坛帖获取进度:" + str(o + 1) + "/" + str(article_title.shape[0]))
        self.signal.emit("论坛帖收集完成。(2/5)")
        import nlp
        from numpy import average
        self.signal.emit("NLP模块加载完成。(3/5)")
        count = nlp.emotion_advanced()
        highfreq = nlp.word_freq(count)
        self.wordart.emit(highfreq)
        self.signal.emit("高频词统计完成。(4/5)")
        highfreq = dict(zip(highfreq.keys(), [0] * len(highfreq.keys())))
        boson = nlp.boson_load()
        highfreq = nlp.fix_emo(highfreq, boson)

        self.wordart2.emit(highfreq)
        emt = average(list(highfreq.values()))
        self.signal.emit("情感比对统计完成。(5/5)")
        self.signal.emit("平均情感积极性:" + str(emt))
        if emt >= 6:
            self.signal.emit("数据显示:投资者对这个股票非常乐观。")
        elif 4 <= emt < 6:
            self.signal.emit("数据显示:投资者对这个股票持乐观态度。")
        elif 2 <= emt < 4:
            self.signal.emit("数据显示:投资者对这个股票比较乐观。")
        elif 0.5 <= emt < 2:
            self.signal.emit("数据显示:投资者对这个股票中立偏积极。")
        elif -0.5 < emt < 0.5:
            self.signal.emit("数据显示:投资者对这个股票持中立态度。")
        elif -2 < emt <= -0.5:
            self.signal.emit("数据显示:投资者对这个股票中立偏消极。")
        elif -4 < emt <= -2:
            self.signal.emit("数据显示:投资者对这个股票比较消极。")
        elif -6 < emt <= -4:
            self.signal.emit("数据显示:投资者对这个股票持消极态度。")
        elif emt <= -6:
            self.signal.emit("数据显示:投资者对这个股票非常消极。")
        else:
            self.signal.emit("未得出结果。")
        self.signal.emit("高频词:" + " ".join(list(highfreq.keys())))
        return self.signal.emit("分析结束。")
Example #4
def loaddata(data_dir="data"):
    now = datetime.datetime.now()
    path = os.path.join(
        data_dir, f"{now.year}-{now.month}-{now.day}-emploitogo-data.json")
    if not os.path.exists(path):
        print("Jobs not found...")
        import collect
        collect.main()
    with open(path, "r") as f:
        jobs = json.load(f)
    print("Total jobs:", len(jobs))
    return jobs

def download_and_process(self, project, ncfile):
    downloaded_file = os.path.join(self.download, project, ncfile)
    output_file = os.path.join(self.output, project, ncfile)
    do_download = not os.path.isfile(downloaded_file)
    main(self.output,
         self.download,
         do_download, [project.lower()],
         self.csv,
         filesubset=[ncfile.lower()])
    assert os.path.isfile(downloaded_file)
    assert os.path.isfile(output_file)
    return output_file
Example #6
def test(size, folder, index=None, mode=bins.default_mode):
    """Compares url's distance to the reference distrobtion with the threshold. If index is specified existing urls are used if not new requests are recorded.q"""
    path = size + "/" + folder + "/"

    if index is None:
        index = 1
        while os.path.exists(path + str(index)):
            index += 1

        col.main(path + str(index), sample_size=size)

    with open(path + str(index), "rb") as f:
        urls = pickle.load(f)

    return test_raw(urls, size, mode)
Example #7
def main():
    longitud, number = collect.main()
    average = cluster.main()
    positive, negative, ejemplo1, ejemplo2 = classify.main()
    f = open("summary.txt", "w", encoding="utf-8")
    f.write("Number of users collected: %d\n" % longitud)
    f.write("Number of messages collected: %d\n" % number)
    f.write("Average number of users per community: %d\n" % average)
    f.write("Number of instances per class found: %d, %d\n" %
            (positive, negative))
    f.write("%s\n" % str(ejemplo1))
    f.write("%s\n" % str(ejemplo2))
    f.close()
    f2 = open("description.txt", "w", encoding="utf-8")
    f2.write(
        "Study of the impact that a tweet from a friend of DowJones has on the stock market.\n"
    )
    f2.write(
        "I researched the top 100 Twitter friends of the DowJones account.\n First, I downloaded the SP500 values minute by minute for a couple of weeks from finance.google.com.\n For each tweet from a DowJones friend during that time, I measured its impact on the stock market by taking the market value at the tweet's time and subtracting the value 5 minutes later. I classified the tweet as having a positive impact if that difference is positive.\n Otherwise, if the difference is negative, the tweet had a negative impact.\n"
    )
    f2.write(
        "For the classifier, I clustered the words of the training tweets with 10-means.\n The number of words in each cluster is one of the features fed into the classifier; another feature is the time each tweet was published.\n With this I built a classifier that predicts the impact an analyst's tweet will have on the SP500.\n"
    )
    f2.write(
        "In the clustering, I looked at how many communities exist and found that there are small communities.\n They never merge into a single community with all stock analysts in one cluster. I also looked at where each of the analyst friends is located: the United States is the main country in the clusters, but the United Kingdom, Iran and even Australia also carry some weight in these clusters.\n"
    )
    f2.close()
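The labeling rule described in the text written to description.txt above can be made concrete with a minimal sketch; sp500_value_at is a hypothetical lookup (timestamp to index value), not a function from this project:

from datetime import timedelta

def label_tweet_impact(tweet_time, sp500_value_at, window_minutes=5):
    # Index value when the tweet was published, and 5 minutes later.
    value_now = sp500_value_at(tweet_time)
    value_later = sp500_value_at(tweet_time + timedelta(minutes=window_minutes))
    # Per the description: positive difference -> positive impact,
    # negative difference -> negative impact.
    difference = value_now - value_later
    return "positive" if difference > 0 else "negative"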
Example #8
def main(collect_new, enable_graphs, enable_emulations):

    if collect_new:
        print "Press enter to collect reference urls"
        raw_input()
        collect.main('1000/censor/reference_urls')

        print "Press enter to collect fakes."
        raw_input()
        for i in range(10):
            collect.main('1000/fakes/' + str(i))

        print "Press enter to collect normal packets."
        raw_input()
        for i in range(10):
            collect.main('1000/normals/' + str(i))

    for mode in bins.modes.values():
        print "Creating " + mode + " histogram."
        hist = bins.sort_file("1000/censor/reference_urls", mode)
        with open("1000/censor/reference_" + mode + "_bins", 'wb') as f:
            print str(hist)[:100], "...", str(hist)[-100:]
            pickle.dump(hist, f)

    for mode in bins.modes.values():
        print "Calculating " + mode + " threshold"
        results = find_threshold.calculate(mode=mode,
                                           enable_graphs=enable_graphs)
        with open("1000/censor/threshold_" + mode, "w") as f:
            print str(results)
            f.write(str(results))

    if enable_emulations:
        for mode in bins.modes.values():
            if mode != bins.modes["INTER_SLASH_DIST"]:
                print "Testing emulation in " + mode + " mode."
                emulator.test(mode)

        for mode in bins.modes.values():
            if mode != bins.modes["INTER_SLASH_DIST"]:
                print "Emulating example ciphertexts in " + mode + " mode."
                for i in range(10):
                    urls = emulator.get_emulations(mode=mode)
                    with open("1000/emulated/" + mode + "/" + str(i),
                              "wb") as f:
                        pickle.dump(urls, f)
Example #9
def main():
    with open('./summary.txt', 'w') as f:
        sys.stdout = f
        collect.main()
        cluster.main()
        classify.main()
Example #10
import collect
import cluster
import classify

collect.main()
cluster.main()
classify.main()

filename = "summary.txt"
file = open(filename, 'w')
file.write('Number of users collected: ' + str(collect.a) + '\n')
file.write('Number of messages collected: ' + str(collect.b) + '\n')
file.write('Number of communities discovered: ' + str(cluster.c) + '\n')
file.write('Average number of users per community: ' + str(cluster.d) + '\n')
file.write('Number of instances per class found: ' + str(classify.out) + '\n')
file.write('One example from each class: ' + str(cluster.instance1) + '\n' +
           str(classify.instance2) + '\n' + str(classify.instance3) + '\n')

file.close()
Example #11
def main(argv):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    merge_file_name = os.path.join(current_dir, u'merge.py')
    collect_web_property_file_name = os.path.join(
        current_dir, u'collect_web_property.py')
    profile_dir = os.path.join(current_dir, u'proj/profile')
    command_dir = os.path.join(current_dir, u'proj/command')
    cs_conf_dir = os.path.join(current_dir, u'proj/client_secrets')
    cs_data_dir = os.path.join(current_dir, u'proj/client_secrets')
    output_dir = os.path.join(current_dir, u'proj/output')
    collect_output_dir = os.path.join(current_dir, u'proj/output/collect')
    merge_output_dir = os.path.join(current_dir, u'proj/output/merge')
    collect_log_dir = os.path.join(current_dir, u'proj/log/collect')
    merge_log_dir = os.path.join(current_dir, 'proj/log/merge')

    # default collect date: yesterday
    collect_date = datetime.date.today() - datetime.timedelta(days=1)

    if len(argv) > 0:
        argv_start_index = 0
        reg_date = r'[0-9]{4}-[0-9]{2}-[0-9]{2}'
        if not re.match(reg_date, argv[argv_start_index]):
            print u'collect date error, reg:[0-9]{4}-[0-9]{2}-[0-9]{2}'
            exit(1)
        collect_date_array = argv[argv_start_index].split('-')
        collect_date = datetime.date(int(collect_date_array[0]), int(
            collect_date_array[1]), int(collect_date_array[2]))

    # collect
    if os.path.exists(os.path.join(collect_log_dir, str(collect_date))):
        shutil.rmtree(os.path.join(collect_log_dir, str(collect_date)))
    for profile_file in os.listdir(profile_dir):
        if os.path.isfile(os.path.join(profile_dir, profile_file)) and profile_file.find(u'.txt') > 0:
            cs_file_name = u'cs_1'
            if profile_file in conf._cs_conf:
                cs_file_name = conf._cs_conf[profile_file]
            cs_conf_file = os.path.join(cs_conf_dir, cs_file_name + u'.json')
            cs_data_file = os.path.join(cs_data_dir, cs_file_name + u'.dat')
            cmd_file = os.path.join(command_dir, u'cmd.txt')
            profile_file_path = os.path.join(profile_dir, profile_file)
            output_dir = os.path.join(collect_output_dir, str(
                collect_date), profile_file[:profile_file.find('.txt')])
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print u'Start Collect', profile_file
            collect.main(
                [collect_web_property_file_name, profile_file_path, cmd_file,
                 cs_conf_file, cs_data_file, collect_log_dir, output_dir, str(collect_date)])
    print u'All Collect Succ.'

    # merge
    if os.path.exists(os.path.join(merge_log_dir, str(collect_date))):
        shutil.rmtree(os.path.join(merge_log_dir, str(collect_date)))
    merge_commands = []
    for profile_file in os.listdir(profile_dir):
        if os.path.isfile(os.path.join(profile_dir, profile_file)) and profile_file.find(u'.txt') > 0:
            profile_file_path = os.path.join(profile_dir, profile_file)
            data_dir = os.path.join(collect_output_dir, str(
                collect_date), profile_file[:profile_file.find('.txt')])
            output_dir = os.path.join(merge_output_dir, str(collect_date))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            comp_command = [
                u'python', merge_file_name, profile_file_path, data_dir, output_dir]
            log_file = util.create_log_file_path(
                os.path.join(merge_log_dir, str(collect_date)), profile_file[:profile_file.find('.')])
            merge_commands.append({u'command': comp_command, u'log': log_file})

    print u'Start Merge'
    util.process_pump(merge_commands, 3, 5, 0.5)
    print u'All Merge Succ.'

    # last merge
    merged_data_dir = os.path.join(merge_output_dir, str(collect_date))
    last_merge_dir = os.path.join(
        merge_output_dir, str(collect_date), u'last_merge')
    if os.path.exists(last_merge_dir):
        shutil.rmtree(last_merge_dir)
    os.makedirs(last_merge_dir)
    last_merge_index_file_path = os.path.join(
        last_merge_dir, u'last_merge.txt')
    if os.path.exists(last_merge_index_file_path):
        os.remove(last_merge_index_file_path)
    f = codecs.open(last_merge_index_file_path, encoding='utf-8', mode='w')
    for merged_file in os.listdir(merged_data_dir):
        if os.path.isfile(os.path.join(merged_data_dir, merged_file)) and merged_file.find(u'.json') > 0:
            f.write(merged_file[:merged_file.find(u'.')])
            f.write(u'\r\n')
    f.close()

    if os.path.exists(os.path.join(merge_log_dir, str(collect_date), u'last_merge')):
        shutil.rmtree(
            os.path.join(merge_log_dir, str(collect_date), u'last_merge'))
    comp_command = [u'python', merge_file_name,
                    last_merge_index_file_path, merged_data_dir, last_merge_dir]
    log_file = util.create_log_file_path(
        os.path.join(merge_log_dir, str(collect_date), u'last_merge'), u'last_merge')
    print u'Start Last Merge'
    util.process_pump(
        [{u'command': comp_command, u'log': log_file}], 3, 5, 0.5)
    print u'Last Merge Succ.'

    print u'All Succ.'