def demo(input=""): print("Reading the documents (from json dataset)... input should be 'Data/jsonFile#jsonIndex") jsonFile = input.split("#")[0] jsonIndex = input.split("#")[1] cliqueOfArticles = functions.readJsonFile(jsonFile) contents = cliqueOfArticles[jsonIndex]["contents"] publications = cliqueOfArticles[jsonIndex]["publications"] titles = cliqueOfArticles[jsonIndex]["sentences"] print("Initialize object...") gDoc=graphDoc.graphDocuments(contents,publications,titles) # initialize object with documents and publication classes e.g. cnn, fox print("Extracting sentence structure...") gDoc.sentenceProcess(withGA=True, output="temp/sentences.pkl") print("Computing sentence similarities...") gDoc.computeSentenceDistances(similarityFunction = "cosine") print("Keeping only the most important sentence-to-sentence similarities (thresholding)...") gDoc.reduceSentenceSimilarityFrame(pA=85,pB=93) print("Create graph... (no plotting)") gDoc.computeNetwork(plot=False,cliqueEdges=[]) print("Clique finder in the graph...") gDoc.cliqueFinder(output="temp/cliquesFinal.json",orderby="median tf-idf score")
def load_clique_results(glob_expression):
    """Load every JSON result file matching `glob_expression`, keyed by file name."""
    file_matches = glob.glob(glob_expression)
    result = {}
    for f_name in file_matches:
        f_content = functions.readJsonFile(f_name)
        result[f_name] = f_content
    return result
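# Usage sketch for load_clique_results(); the pattern mirrors the
# 'temp/cliques_GA/cliques_<id>.json' naming that the Streamlit demo below
# reads (assumption: that directory is where the per-clique results were written):
#
#   results = load_clique_results('temp/cliques_GA/cliques_*.json')
#   for path, cliques in results.items():
#       print(path, len(cliques))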
def all_clique_processing(jsonFile):
    # Find cross-referenced pieces of information across all document cliques
    print("Reading the documents")
    cliqueOfArticles = functions.readJsonFile(jsonFile)
    items = cliqueOfArticles.items()
    pool = Pool(processes=8)
    for clique_id in tqdm(pool.imap_unordered(fn_wrap, items), desc='outer loop'):
        print(clique_id, 'done')
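# fn_wrap is defined elsewhere in the repo. A plausible sketch (an assumption,
# not the actual implementation): it unpacks one (clique_id, clique) item,
# runs the same pipeline as demo() on that clique, and returns the clique_id
# so the loop above can report progress:
#
#   def fn_wrap(item):
#       clique_id, clique = item
#       ...  # build graphDoc.graphDocuments(...) and run cliqueFinder on it
#       return clique_id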
def stats(jsonFile):
    # Computes some stats about initial cliques
    print('computing stats about input')
    cliqueOfArticles = functions.readJsonFile(jsonFile)
    size_acc = 0
    size_distrib = []
    clique_sim_distrib = []
    per_outlet = defaultdict(int)
    for k, v in cliqueOfArticles.items():
        publications = v['publications']
        size_acc += len(publications)
        size_distrib.append(float(len(publications)))
        clique_sim_distrib.append(v['score'])
        for o in publications:
            per_outlet[o] += 1
    print('total articles', size_acc)
    print('#cliques', len(cliqueOfArticles))
    print('average clique len', size_acc / len(cliqueOfArticles))
    print(size_distrib)
    plot.plot_distribution(size_distrib, 'temp/fig_clique_size_distrib.png')
    plot.plot_distribution(clique_sim_distrib, 'temp/fig_clique_sim_distrib.png')
    print(per_outlet)
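# Usage sketch: running stats() on the same dataset file the demo uses prints
# the per-outlet article counts and writes the two distribution plots under temp/:
#
#   stats('Data/dataset.json')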
'based on the authors\' [demo](http://fairnews.ewi.tudelft.nl/InCredible/) '
'and [source code](https://github.com/dbountouridis/InCredible). ')
st.sidebar.info(
    'The code for this demo is [here](https://github.com/MartinoMensio/InCredible)')
st.sidebar.title('Instructions')
st.sidebar.info(
    'Select a document clique to load: each one represents a different story')
st.sidebar.info('Select a main source to see the corresponding article')

doc_cliques_input_file = st.text_input('Document cliques input file:',
                                       'Data/dataset.json')
output_path = st.text_input('Path where the computation results are:',
                            'temp/cliques_GA/')
document_cliques = functions.readJsonFile(doc_cliques_input_file)
document_cliques_ids = list(document_cliques.keys())
default = '0.7996630192184154-7-6180'
chosen_doc_clique_id = st.selectbox('Document clique to load:',
                                    document_cliques_ids,
                                    document_cliques_ids.index(default))
chosen_doc_clique = document_cliques[chosen_doc_clique_id]
clique_outputs = functions.readJsonFile(
    Path(output_path) / f'cliques_{chosen_doc_clique_id}.json')
st.text(f'Clique {chosen_doc_clique_id} selected')
st.text('Titles:\n\n' + '\n'.join([
    f'{p}--> {t}' for p, t in zip(chosen_doc_clique['publications'],
                                  chosen_doc_clique['sentences'])
]))
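# The block above is a Streamlit script, so it is launched with the Streamlit
# CLI rather than imported (the file name here is an assumption):
#
#   streamlit run app.py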